/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Diff of /code/trunk/pcre_exec.c

Parent Directory | Revision Log | View Patch

revision 227 by ph10, Tue Aug 21 15:00:15 2007 UTC | revision 409 by ph10, Sat Mar 28 17:10:56 2009 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2007 University of Cambridge             Copyright (c) 1997-2009 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 43  pattern matching using an NFA algorithm, Line 43  pattern matching using an NFA algorithm,
43  possible. There are also some static supporting functions. */  possible. There are also some static supporting functions. */
44    
45  #ifdef HAVE_CONFIG_H  #ifdef HAVE_CONFIG_H
46  #include <config.h>  #include "config.h"
47  #endif  #endif
48    
49  #define NLBLOCK md             /* Block containing newline information */  #define NLBLOCK md             /* Block containing newline information */
# Line 158  printf("\n"); Line 158  printf("\n");
158    
159  if (length > md->end_subject - eptr) return FALSE;  if (length > md->end_subject - eptr) return FALSE;
160    
161  /* Separate the caselesss case for speed */  /* Separate the caseless case for speed. In UTF-8 mode we can only do this
162    properly if Unicode properties are supported. Otherwise, we can check only
163    ASCII characters. */
164    
165  if ((ims & PCRE_CASELESS) != 0)  if ((ims & PCRE_CASELESS) != 0)
166    {    {
167    #ifdef SUPPORT_UTF8
168    #ifdef SUPPORT_UCP
169      if (md->utf8)
170        {
171        USPTR endptr = eptr + length;
172        while (eptr < endptr)
173          {
174          int c, d;
175          GETCHARINC(c, eptr);
176          GETCHARINC(d, p);
177          if (c != d && c != UCD_OTHERCASE(d)) return FALSE;
178          }
179        }
180      else
181    #endif
182    #endif
183    
184      /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
185      is no UCP support. */
186    
187    while (length-- > 0)    while (length-- > 0)
188      if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;      { if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; }
189    }    }
190    
191    /* In the caseful case, we can just compare the bytes, whether or not we
192    are in UTF-8 mode. */
193    
194  else  else
195    { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }    { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
196    
# Line 296  typedef struct heapframe { Line 322  typedef struct heapframe {
322    
323    /* Function arguments that may change */    /* Function arguments that may change */
324    
325    const uschar *Xeptr;    USPTR Xeptr;
326    const uschar *Xecode;    const uschar *Xecode;
327    const uschar *Xmstart;    USPTR Xmstart;
328    int Xoffset_top;    int Xoffset_top;
329    long int Xims;    long int Xims;
330    eptrblock *Xeptrb;    eptrblock *Xeptrb;
# Line 307  typedef struct heapframe { Line 333  typedef struct heapframe {
333    
334    /* Function local variables */    /* Function local variables */
335    
336    const uschar *Xcallpat;    USPTR Xcallpat;
337    const uschar *Xcharptr;  #ifdef SUPPORT_UTF8
338    const uschar *Xdata;    USPTR Xcharptr;
339    const uschar *Xnext;  #endif
340    const uschar *Xpp;    USPTR Xdata;
341    const uschar *Xprev;    USPTR Xnext;
342    const uschar *Xsaved_eptr;    USPTR Xpp;
343      USPTR Xprev;
344      USPTR Xsaved_eptr;
345    
346    recursion_info Xnew_recursive;    recursion_info Xnew_recursive;
347    
# Line 334  typedef struct heapframe { Line 362  typedef struct heapframe {
362    uschar Xocchars[8];    uschar Xocchars[8];
363  #endif  #endif
364    
365      int Xcodelink;
366    int Xctype;    int Xctype;
367    unsigned int Xfc;    unsigned int Xfc;
368    int Xfi;    int Xfi;
# Line 399  Returns: MATCH_MATCH if matched Line 428  Returns: MATCH_MATCH if matched
428  */  */
429    
430  static int  static int
431  match(REGISTER USPTR eptr, REGISTER const uschar *ecode, const uschar *mstart,  match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart,
432    int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,    int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
433    int flags, unsigned int rdepth)    int flags, unsigned int rdepth)
434  {  {
# Line 413  register unsigned int c; /* Character Line 442  register unsigned int c; /* Character
442  register BOOL utf8;        /* Local copy of UTF-8 flag for speed */  register BOOL utf8;        /* Local copy of UTF-8 flag for speed */
443    
444  BOOL minimize, possessive; /* Quantifier options */  BOOL minimize, possessive; /* Quantifier options */
445    int condcode;
446    
447  /* When recursion is not being used, all "local" variables that have to be  /* When recursion is not being used, all "local" variables that have to be
448  preserved over calls to RMATCH() are part of a "frame" which is obtained from  preserved over calls to RMATCH() are part of a "frame" which is obtained from
# Line 455  HEAP_RECURSE: Line 485  HEAP_RECURSE:
485  #define charptr            frame->Xcharptr  #define charptr            frame->Xcharptr
486  #endif  #endif
487  #define callpat            frame->Xcallpat  #define callpat            frame->Xcallpat
488    #define codelink           frame->Xcodelink
489  #define data               frame->Xdata  #define data               frame->Xdata
490  #define next               frame->Xnext  #define next               frame->Xnext
491  #define pp                 frame->Xpp  #define pp                 frame->Xpp
# Line 535  int oclength; Line 566  int oclength;
566  uschar occhars[8];  uschar occhars[8];
567  #endif  #endif
568    
569    int codelink;
570  int ctype;  int ctype;
571  int length;  int length;
572  int max;  int max;
# Line 761  for (;;) Line 793  for (;;)
793    
794      case OP_COND:      case OP_COND:
795      case OP_SCOND:      case OP_SCOND:
796      if (ecode[LINK_SIZE+1] == OP_RREF)         /* Recursion test */      codelink= GET(ecode, 1);
797    
798        /* Because of the way auto-callout works during compile, a callout item is
799        inserted between OP_COND and an assertion condition. */
800    
801        if (ecode[LINK_SIZE+1] == OP_CALLOUT)
802          {
803          if (pcre_callout != NULL)
804            {
805            pcre_callout_block cb;
806            cb.version          = 1;   /* Version 1 of the callout block */
807            cb.callout_number   = ecode[LINK_SIZE+2];
808            cb.offset_vector    = md->offset_vector;
809            cb.subject          = (PCRE_SPTR)md->start_subject;
810            cb.subject_length   = md->end_subject - md->start_subject;
811            cb.start_match      = mstart - md->start_subject;
812            cb.current_position = eptr - md->start_subject;
813            cb.pattern_position = GET(ecode, LINK_SIZE + 3);
814            cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
815            cb.capture_top      = offset_top/2;
816            cb.capture_last     = md->capture_last;
817            cb.callout_data     = md->callout_data;
818            if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
819            if (rrc < 0) RRETURN(rrc);
820            }
821          ecode += _pcre_OP_lengths[OP_CALLOUT];
822          }
823    
824        condcode = ecode[LINK_SIZE+1];
825    
826        /* Now see what the actual condition is */
827    
828        if (condcode == OP_RREF)         /* Recursion test */
829        {        {
830        offset = GET2(ecode, LINK_SIZE + 2);     /* Recursion group number*/        offset = GET2(ecode, LINK_SIZE + 2);     /* Recursion group number*/
831        condition = md->recursive != NULL &&        condition = md->recursive != NULL &&
# Line 769  for (;;) Line 833  for (;;)
833        ecode += condition? 3 : GET(ecode, 1);        ecode += condition? 3 : GET(ecode, 1);
834        }        }
835    
836      else if (ecode[LINK_SIZE+1] == OP_CREF)    /* Group used test */      else if (condcode == OP_CREF)    /* Group used test */
837        {        {
838        offset = GET2(ecode, LINK_SIZE+2) << 1;  /* Doubled ref number */        offset = GET2(ecode, LINK_SIZE+2) << 1;  /* Doubled ref number */
839        condition = offset < offset_top && md->offset_vector[offset] >= 0;        condition = offset < offset_top && md->offset_vector[offset] >= 0;
840        ecode += condition? 3 : GET(ecode, 1);        ecode += condition? 3 : GET(ecode, 1);
841        }        }
842    
843      else if (ecode[LINK_SIZE+1] == OP_DEF)     /* DEFINE - always false */      else if (condcode == OP_DEF)     /* DEFINE - always false */
844        {        {
845        condition = FALSE;        condition = FALSE;
846        ecode += GET(ecode, 1);        ecode += GET(ecode, 1);
# Line 803  for (;;) Line 867  for (;;)
867        else        else
868          {          {
869          condition = FALSE;          condition = FALSE;
870          ecode += GET(ecode, 1);          ecode += codelink;
871          }          }
872        }        }
873    
# Line 826  for (;;) Line 890  for (;;)
890          goto TAIL_RECURSE;          goto TAIL_RECURSE;
891          }          }
892        }        }
893      else                         /* Condition false & no 2nd alternative */      else                         /* Condition false & no alternative */
894        {        {
895        ecode += 1 + LINK_SIZE;        ecode += 1 + LINK_SIZE;
896        }        }
# Line 1049  for (;;) Line 1113  for (;;)
1113          else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)          else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1114            {            {
1115            DPRINTF(("Recursion gave error %d\n", rrc));            DPRINTF(("Recursion gave error %d\n", rrc));
1116              if (new_recursive.offset_save != stacksave)
1117                (pcre_free)(new_recursive.offset_save);
1118            RRETURN(rrc);            RRETURN(rrc);
1119            }            }
1120    
# Line 1148  for (;;) Line 1214  for (;;)
1214      do ecode += GET(ecode,1); while (*ecode == OP_ALT);      do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1215      break;      break;
1216    
1217      /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating      /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1218      that it may occur zero times. It may repeat infinitely, or not at all -      indicating that it may occur zero times. It may repeat infinitely, or not
1219      i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper      at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1220      repeat limits are compiled as a number of copies, with the optional ones      with fixed upper repeat limits are compiled as a number of copies, with the
1221      preceded by BRAZERO or BRAMINZERO. */      optional ones preceded by BRAZERO or BRAMINZERO. */
1222    
1223      case OP_BRAZERO:      case OP_BRAZERO:
1224        {        {
# Line 1174  for (;;) Line 1240  for (;;)
1240        }        }
1241      break;      break;
1242    
1243        case OP_SKIPZERO:
1244          {
1245          next = ecode+1;
1246          do next += GET(next,1); while (*next == OP_ALT);
1247          ecode = next + 1 + LINK_SIZE;
1248          }
1249        break;
1250    
1251      /* End of a group, repeated or non-repeating. */      /* End of a group, repeated or non-repeating. */
1252    
1253      case OP_KET:      case OP_KET:
# Line 1387  for (;;) Line 1461  for (;;)
1461          {          {
1462          if (eptr == md->start_subject) prev_is_word = FALSE; else          if (eptr == md->start_subject) prev_is_word = FALSE; else
1463            {            {
1464            const uschar *lastptr = eptr - 1;            USPTR lastptr = eptr - 1;
1465            while((*lastptr & 0xc0) == 0x80) lastptr--;            while((*lastptr & 0xc0) == 0x80) lastptr--;
1466            GETCHAR(c, lastptr);            GETCHAR(c, lastptr);
1467            prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;            prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
# Line 1421  for (;;) Line 1495  for (;;)
1495      /* Match a single character type; inline for speed */      /* Match a single character type; inline for speed */
1496    
1497      case OP_ANY:      case OP_ANY:
1498      if ((ims & PCRE_DOTALL) == 0)      if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1499        {      /* Fall through */
1500        if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);  
1501        }      case OP_ALLANY:
1502      if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);      if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1503      if (utf8)      if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
       while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;  
1504      ecode++;      ecode++;
1505      break;      break;
1506    
# Line 1526  for (;;) Line 1599  for (;;)
1599        case 0x000d:        case 0x000d:
1600        if (eptr < md->end_subject && *eptr == 0x0a) eptr++;        if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
1601        break;        break;
1602    
1603        case 0x000a:        case 0x000a:
1604          break;
1605    
1606        case 0x000b:        case 0x000b:
1607        case 0x000c:        case 0x000c:
1608        case 0x0085:        case 0x0085:
1609        case 0x2028:        case 0x2028:
1610        case 0x2029:        case 0x2029:
1611          if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
1612        break;        break;
1613        }        }
1614      ecode++;      ecode++;
# Line 1642  for (;;) Line 1719  for (;;)
1719      if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);      if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1720      GETCHARINCTEST(c, eptr);      GETCHARINCTEST(c, eptr);
1721        {        {
1722        int chartype, script;        const ucd_record *prop = GET_UCD(c);
       int category = _pcre_ucp_findprop(c, &chartype, &script);  
1723    
1724        switch(ecode[1])        switch(ecode[1])
1725          {          {
# Line 1652  for (;;) Line 1728  for (;;)
1728          break;          break;
1729    
1730          case PT_LAMP:          case PT_LAMP:
1731          if ((chartype == ucp_Lu ||          if ((prop->chartype == ucp_Lu ||
1732               chartype == ucp_Ll ||               prop->chartype == ucp_Ll ||
1733               chartype == ucp_Lt) == (op == OP_NOTPROP))               prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
1734            RRETURN(MATCH_NOMATCH);            RRETURN(MATCH_NOMATCH);
1735           break;           break;
1736    
1737          case PT_GC:          case PT_GC:
1738          if ((ecode[2] != category) == (op == OP_PROP))          if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
1739            RRETURN(MATCH_NOMATCH);            RRETURN(MATCH_NOMATCH);
1740          break;          break;
1741    
1742          case PT_PC:          case PT_PC:
1743          if ((ecode[2] != chartype) == (op == OP_PROP))          if ((ecode[2] != prop->chartype) == (op == OP_PROP))
1744            RRETURN(MATCH_NOMATCH);            RRETURN(MATCH_NOMATCH);
1745          break;          break;
1746    
1747          case PT_SC:          case PT_SC:
1748          if ((ecode[2] != script) == (op == OP_PROP))          if ((ecode[2] != prop->script) == (op == OP_PROP))
1749            RRETURN(MATCH_NOMATCH);            RRETURN(MATCH_NOMATCH);
1750          break;          break;
1751    
# Line 1688  for (;;) Line 1764  for (;;)
1764      if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);      if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1765      GETCHARINCTEST(c, eptr);      GETCHARINCTEST(c, eptr);
1766        {        {
1767        int chartype, script;        int category = UCD_CATEGORY(c);
       int category = _pcre_ucp_findprop(c, &chartype, &script);  
1768        if (category == ucp_M) RRETURN(MATCH_NOMATCH);        if (category == ucp_M) RRETURN(MATCH_NOMATCH);
1769        while (eptr < md->end_subject)        while (eptr < md->end_subject)
1770          {          {
# Line 1698  for (;;) Line 1773  for (;;)
1773            {            {
1774            GETCHARLEN(c, eptr, len);            GETCHARLEN(c, eptr, len);
1775            }            }
1776          category = _pcre_ucp_findprop(c, &chartype, &script);          category = UCD_CATEGORY(c);
1777          if (category != ucp_M) break;          if (category != ucp_M) break;
1778          eptr += len;          eptr += len;
1779          }          }
# Line 1719  for (;;) Line 1794  for (;;)
1794      case OP_REF:      case OP_REF:
1795        {        {
1796        offset = GET2(ecode, 1) << 1;               /* Doubled ref number */        offset = GET2(ecode, 1) << 1;               /* Doubled ref number */
1797        ecode += 3;                                 /* Advance past item */        ecode += 3;
1798    
1799          /* If the reference is unset, there are two possibilities:
1800    
1801          (a) In the default, Perl-compatible state, set the length to be longer
1802          than the amount of subject left; this ensures that every attempt at a
1803          match fails. We can't just fail here, because of the possibility of
1804          quantifiers with zero minima.
1805    
1806        /* If the reference is unset, set the length to be longer than the amount        (b) If the JavaScript compatibility flag is set, set the length to zero
1807        of subject left; this ensures that every attempt at a match fails. We        so that the back reference matches an empty string.
1808        can't just fail here, because of the possibility of quantifiers with zero  
1809        minima. */        Otherwise, set the length to the length of what was matched by the
1810          referenced subpattern. */
1811        length = (offset >= offset_top || md->offset_vector[offset] < 0)?  
1812          md->end_subject - eptr + 1 :        if (offset >= offset_top || md->offset_vector[offset] < 0)
1813          md->offset_vector[offset+1] - md->offset_vector[offset];          length = (md->jscript_compat)? 0 : md->end_subject - eptr + 1;
1814          else
1815            length = md->offset_vector[offset+1] - md->offset_vector[offset];
1816    
1817        /* Set up for repetition, or handle the non-repeated case */        /* Set up for repetition, or handle the non-repeated case */
1818    
# Line 2003  for (;;) Line 2087  for (;;)
2087    
2088    
2089      /* Match an extended character class. This opcode is encountered only      /* Match an extended character class. This opcode is encountered only
2090      in UTF-8 mode, because that's the only time it is compiled. */      when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2091        mode, because Unicode properties are supported in non-UTF-8 mode. */
2092    
2093  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2094      case OP_XCLASS:      case OP_XCLASS:
# Line 2045  for (;;) Line 2130  for (;;)
2130        for (i = 1; i <= min; i++)        for (i = 1; i <= min; i++)
2131          {          {
2132          if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);          if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2133          GETCHARINC(c, eptr);          GETCHARINCTEST(c, eptr);
2134          if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);          if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2135          }          }
2136    
# Line 2064  for (;;) Line 2149  for (;;)
2149            RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);            RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
2150            if (rrc != MATCH_NOMATCH) RRETURN(rrc);            if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2151            if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);            if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2152            GETCHARINC(c, eptr);            GETCHARINCTEST(c, eptr);
2153            if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);            if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2154            }            }
2155          /* Control never gets here */          /* Control never gets here */
# Line 2079  for (;;) Line 2164  for (;;)
2164            {            {
2165            int len = 1;            int len = 1;
2166            if (eptr >= md->end_subject) break;            if (eptr >= md->end_subject) break;
2167            GETCHARLEN(c, eptr, len);            GETCHARLENTEST(c, eptr, len);
2168            if (!_pcre_xclass(c, data)) break;            if (!_pcre_xclass(c, data)) break;
2169            eptr += len;            eptr += len;
2170            }            }
# Line 2154  for (;;) Line 2239  for (;;)
2239          if (fc != dc)          if (fc != dc)
2240            {            {
2241  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2242            if (dc != _pcre_ucp_othercase(fc))            if (dc != UCD_OTHERCASE(fc))
2243  #endif  #endif
2244              RRETURN(MATCH_NOMATCH);              RRETURN(MATCH_NOMATCH);
2245            }            }
# Line 2245  for (;;) Line 2330  for (;;)
2330  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2331          unsigned int othercase;          unsigned int othercase;
2332          if ((ims & PCRE_CASELESS) != 0 &&          if ((ims & PCRE_CASELESS) != 0 &&
2333              (othercase = _pcre_ucp_othercase(fc)) != NOTACHAR)              (othercase = UCD_OTHERCASE(fc)) != fc)
2334            oclength = _pcre_ord2utf8(othercase, occhars);            oclength = _pcre_ord2utf8(othercase, occhars);
2335          else oclength = 0;          else oclength = 0;
2336  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
# Line 2565  for (;;) Line 2650  for (;;)
2650              {              {
2651              RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);              RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
2652              if (rrc != MATCH_NOMATCH) RRETURN(rrc);              if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2653                if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2654              GETCHARINC(d, eptr);              GETCHARINC(d, eptr);
2655              if (d < 256) d = md->lcc[d];              if (d < 256) d = md->lcc[d];
2656              if (fi >= max || eptr >= md->end_subject || fc == d)              if (fc == d) RRETURN(MATCH_NOMATCH);
2657                RRETURN(MATCH_NOMATCH);  
2658              }              }
2659            }            }
2660          else          else
# Line 2674  for (;;) Line 2760  for (;;)
2760              {              {
2761              RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);              RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
2762              if (rrc != MATCH_NOMATCH) RRETURN(rrc);              if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2763                if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2764              GETCHARINC(d, eptr);              GETCHARINC(d, eptr);
2765              if (fi >= max || eptr >= md->end_subject || fc == d)              if (fc == d) RRETURN(MATCH_NOMATCH);
               RRETURN(MATCH_NOMATCH);  
2766              }              }
2767            }            }
2768          else          else
# Line 2850  for (;;) Line 2936  for (;;)
2936              {              {
2937              if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);              if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2938              GETCHARINCTEST(c, eptr);              GETCHARINCTEST(c, eptr);
2939              prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);              prop_chartype = UCD_CHARTYPE(c);
2940              if ((prop_chartype == ucp_Lu ||              if ((prop_chartype == ucp_Lu ||
2941                   prop_chartype == ucp_Ll ||                   prop_chartype == ucp_Ll ||
2942                   prop_chartype == ucp_Lt) == prop_fail_result)                   prop_chartype == ucp_Lt) == prop_fail_result)
# Line 2863  for (;;) Line 2949  for (;;)
2949              {              {
2950              if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);              if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2951              GETCHARINCTEST(c, eptr);              GETCHARINCTEST(c, eptr);
2952              prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);              prop_category = UCD_CATEGORY(c);
2953              if ((prop_category == prop_value) == prop_fail_result)              if ((prop_category == prop_value) == prop_fail_result)
2954                RRETURN(MATCH_NOMATCH);                RRETURN(MATCH_NOMATCH);
2955              }              }
# Line 2874  for (;;) Line 2960  for (;;)
2960              {              {
2961              if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);              if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2962              GETCHARINCTEST(c, eptr);              GETCHARINCTEST(c, eptr);
2963              prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);              prop_chartype = UCD_CHARTYPE(c);
2964              if ((prop_chartype == prop_value) == prop_fail_result)              if ((prop_chartype == prop_value) == prop_fail_result)
2965                RRETURN(MATCH_NOMATCH);                RRETURN(MATCH_NOMATCH);
2966              }              }
# Line 2885  for (;;) Line 2971  for (;;)
2971              {              {
2972              if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);              if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2973              GETCHARINCTEST(c, eptr);              GETCHARINCTEST(c, eptr);
2974              prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);              prop_script = UCD_SCRIPT(c);
2975              if ((prop_script == prop_value) == prop_fail_result)              if ((prop_script == prop_value) == prop_fail_result)
2976                RRETURN(MATCH_NOMATCH);                RRETURN(MATCH_NOMATCH);
2977              }              }
# Line 2904  for (;;) Line 2990  for (;;)
2990          for (i = 1; i <= min; i++)          for (i = 1; i <= min; i++)
2991            {            {
2992            GETCHARINCTEST(c, eptr);            GETCHARINCTEST(c, eptr);
2993            prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);            prop_category = UCD_CATEGORY(c);
2994            if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);            if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
2995            while (eptr < md->end_subject)            while (eptr < md->end_subject)
2996              {              {
# Line 2913  for (;;) Line 2999  for (;;)
2999                {                {
3000                GETCHARLEN(c, eptr, len);                GETCHARLEN(c, eptr, len);
3001                }                }
3002              prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);              prop_category = UCD_CATEGORY(c);
3003              if (prop_category != ucp_M) break;              if (prop_category != ucp_M) break;
3004              eptr += len;              eptr += len;
3005              }              }
# Line 2931  for (;;) Line 3017  for (;;)
3017          case OP_ANY:          case OP_ANY:
3018          for (i = 1; i <= min; i++)          for (i = 1; i <= min; i++)
3019            {            {
3020            if (eptr >= md->end_subject ||            if (eptr >= md->end_subject || IS_NEWLINE(eptr))
                ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))  
3021              RRETURN(MATCH_NOMATCH);              RRETURN(MATCH_NOMATCH);
3022            eptr++;            eptr++;
3023            while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;            while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3024            }            }
3025          break;          break;
3026    
3027            case OP_ALLANY:
3028            for (i = 1; i <= min; i++)
3029              {
3030              if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3031              eptr++;
3032              while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3033              }
3034            break;
3035    
3036          case OP_ANYBYTE:          case OP_ANYBYTE:
3037          eptr += min;          eptr += min;
3038          break;          break;
# Line 2954  for (;;) Line 3048  for (;;)
3048              case 0x000d:              case 0x000d:
3049              if (eptr < md->end_subject && *eptr == 0x0a) eptr++;              if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3050              break;              break;
3051    
3052              case 0x000a:              case 0x000a:
3053                break;
3054    
3055              case 0x000b:              case 0x000b:
3056              case 0x000c:              case 0x000c:
3057              case 0x0085:              case 0x0085:
3058              case 0x2028:              case 0x2028:
3059              case 0x2029:              case 0x2029:
3060                if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3061              break;              break;
3062              }              }
3063            }            }
# Line 3143  for (;;) Line 3241  for (;;)
3241        switch(ctype)        switch(ctype)
3242          {          {
3243          case OP_ANY:          case OP_ANY:
3244          if ((ims & PCRE_DOTALL) == 0)          for (i = 1; i <= min; i++)
3245            {            {
3246            for (i = 1; i <= min; i++)            if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
3247              {            eptr++;
             if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);  
             eptr++;  
             }  
3248            }            }
3249          else eptr += min;          break;
3250    
3251            case OP_ALLANY:
3252            eptr += min;
3253          break;          break;
3254    
3255          case OP_ANYBYTE:          case OP_ANYBYTE:
# Line 3172  for (;;) Line 3270  for (;;)
3270              if (eptr < md->end_subject && *eptr == 0x0a) eptr++;              if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3271              break;              break;
3272              case 0x000a:              case 0x000a:
3273                break;
3274    
3275              case 0x000b:              case 0x000b:
3276              case 0x000c:              case 0x000c:
3277              case 0x0085:              case 0x0085:
3278                if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3279              break;              break;
3280              }              }
3281            }            }
# Line 3314  for (;;) Line 3415  for (;;)
3415              if (rrc != MATCH_NOMATCH) RRETURN(rrc);              if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3416              if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);              if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3417              GETCHARINC(c, eptr);              GETCHARINC(c, eptr);
3418              prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);              prop_chartype = UCD_CHARTYPE(c);
3419              if ((prop_chartype == ucp_Lu ||              if ((prop_chartype == ucp_Lu ||
3420                   prop_chartype == ucp_Ll ||                   prop_chartype == ucp_Ll ||
3421                   prop_chartype == ucp_Lt) == prop_fail_result)                   prop_chartype == ucp_Lt) == prop_fail_result)
# Line 3329  for (;;) Line 3430  for (;;)
3430              if (rrc != MATCH_NOMATCH) RRETURN(rrc);              if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3431              if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);              if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3432              GETCHARINC(c, eptr);              GETCHARINC(c, eptr);
3433              prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);              prop_category = UCD_CATEGORY(c);
3434              if ((prop_category == prop_value) == prop_fail_result)              if ((prop_category == prop_value) == prop_fail_result)
3435                RRETURN(MATCH_NOMATCH);                RRETURN(MATCH_NOMATCH);
3436              }              }
# Line 3342  for (;;) Line 3443  for (;;)
3443              if (rrc != MATCH_NOMATCH) RRETURN(rrc);              if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3444              if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);              if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3445              GETCHARINC(c, eptr);              GETCHARINC(c, eptr);
3446              prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);              prop_chartype = UCD_CHARTYPE(c);
3447              if ((prop_chartype == prop_value) == prop_fail_result)              if ((prop_chartype == prop_value) == prop_fail_result)
3448                RRETURN(MATCH_NOMATCH);                RRETURN(MATCH_NOMATCH);
3449              }              }
# Line 3355  for (;;) Line 3456  for (;;)
3456              if (rrc != MATCH_NOMATCH) RRETURN(rrc);              if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3457              if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);              if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3458              GETCHARINC(c, eptr);              GETCHARINC(c, eptr);
3459              prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);              prop_script = UCD_SCRIPT(c);
3460              if ((prop_script == prop_value) == prop_fail_result)              if ((prop_script == prop_value) == prop_fail_result)
3461                RRETURN(MATCH_NOMATCH);                RRETURN(MATCH_NOMATCH);
3462              }              }
# Line 3377  for (;;) Line 3478  for (;;)
3478            if (rrc != MATCH_NOMATCH) RRETURN(rrc);            if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3479            if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);            if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3480            GETCHARINCTEST(c, eptr);            GETCHARINCTEST(c, eptr);
3481            prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);            prop_category = UCD_CATEGORY(c);
3482            if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);            if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
3483            while (eptr < md->end_subject)            while (eptr < md->end_subject)
3484              {              {
# Line 3386  for (;;) Line 3487  for (;;)
3487                {                {
3488                GETCHARLEN(c, eptr, len);                GETCHARLEN(c, eptr, len);
3489                }                }
3490              prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);              prop_category = UCD_CATEGORY(c);
3491              if (prop_category != ucp_M) break;              if (prop_category != ucp_M) break;
3492              eptr += len;              eptr += len;
3493              }              }
# Line 3405  for (;;) Line 3506  for (;;)
3506            RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);            RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
3507            if (rrc != MATCH_NOMATCH) RRETURN(rrc);            if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3508            if (fi >= max || eptr >= md->end_subject ||            if (fi >= max || eptr >= md->end_subject ||
3509                 (ctype == OP_ANY && (ims & PCRE_DOTALL) == 0 &&                 (ctype == OP_ANY && IS_NEWLINE(eptr)))
                 IS_NEWLINE(eptr)))  
3510              RRETURN(MATCH_NOMATCH);              RRETURN(MATCH_NOMATCH);
3511    
3512            GETCHARINC(c, eptr);            GETCHARINC(c, eptr);
3513            switch(ctype)            switch(ctype)
3514              {              {
3515              case OP_ANY:        /* This is the DOTALL case */              case OP_ANY:        /* This is the non-NL case */
3516              break;              case OP_ALLANY:
   
3517              case OP_ANYBYTE:              case OP_ANYBYTE:
3518              break;              break;
3519    
# Line 3426  for (;;) Line 3525  for (;;)
3525                if (eptr < md->end_subject && *eptr == 0x0a) eptr++;                if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3526                break;                break;
3527                case 0x000a:                case 0x000a:
3528                  break;
3529    
3530                case 0x000b:                case 0x000b:
3531                case 0x000c:                case 0x000c:
3532                case 0x0085:                case 0x0085:
3533                case 0x2028:                case 0x2028:
3534                case 0x2029:                case 0x2029:
3535                  if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3536                break;                break;
3537                }                }
3538              break;              break;
# Line 3563  for (;;) Line 3665  for (;;)
3665            RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);            RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
3666            if (rrc != MATCH_NOMATCH) RRETURN(rrc);            if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3667            if (fi >= max || eptr >= md->end_subject ||            if (fi >= max || eptr >= md->end_subject ||
3668                 ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))                 (ctype == OP_ANY && IS_NEWLINE(eptr)))
3669              RRETURN(MATCH_NOMATCH);              RRETURN(MATCH_NOMATCH);
3670    
3671            c = *eptr++;            c = *eptr++;
3672            switch(ctype)            switch(ctype)
3673              {              {
3674              case OP_ANY:   /* This is the DOTALL case */              case OP_ANY:     /* This is the non-NL case */
3675              break;              case OP_ALLANY:
   
3676              case OP_ANYBYTE:              case OP_ANYBYTE:
3677              break;              break;
3678    
# Line 3582  for (;;) Line 3683  for (;;)
3683                case 0x000d:                case 0x000d:
3684                if (eptr < md->end_subject && *eptr == 0x0a) eptr++;                if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3685                break;                break;
3686    
3687                case 0x000a:                case 0x000a:
3688                  break;
3689    
3690                case 0x000b:                case 0x000b:
3691                case 0x000c:                case 0x000c:
3692                case 0x0085:                case 0x0085:
3693                  if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3694                break;                break;
3695                }                }
3696              break;              break;
# Line 3700  for (;;) Line 3805  for (;;)
3805              int len = 1;              int len = 1;
3806              if (eptr >= md->end_subject) break;              if (eptr >= md->end_subject) break;
3807              GETCHARLEN(c, eptr, len);              GETCHARLEN(c, eptr, len);
3808              prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);              prop_chartype = UCD_CHARTYPE(c);
3809              if ((prop_chartype == ucp_Lu ||              if ((prop_chartype == ucp_Lu ||
3810                   prop_chartype == ucp_Ll ||                   prop_chartype == ucp_Ll ||
3811                   prop_chartype == ucp_Lt) == prop_fail_result)                   prop_chartype == ucp_Lt) == prop_fail_result)
# Line 3715  for (;;) Line 3820  for (;;)
3820              int len = 1;              int len = 1;
3821              if (eptr >= md->end_subject) break;              if (eptr >= md->end_subject) break;
3822              GETCHARLEN(c, eptr, len);              GETCHARLEN(c, eptr, len);
3823              prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);              prop_category = UCD_CATEGORY(c);
3824              if ((prop_category == prop_value) == prop_fail_result)              if ((prop_category == prop_value) == prop_fail_result)
3825                break;                break;
3826              eptr+= len;              eptr+= len;
# Line 3728  for (;;) Line 3833  for (;;)
3833              int len = 1;              int len = 1;
3834              if (eptr >= md->end_subject) break;              if (eptr >= md->end_subject) break;
3835              GETCHARLEN(c, eptr, len);              GETCHARLEN(c, eptr, len);
3836              prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);              prop_chartype = UCD_CHARTYPE(c);
3837              if ((prop_chartype == prop_value) == prop_fail_result)              if ((prop_chartype == prop_value) == prop_fail_result)
3838                break;                break;
3839              eptr+= len;              eptr+= len;
# Line 3741  for (;;) Line 3846  for (;;)
3846              int len = 1;              int len = 1;
3847              if (eptr >= md->end_subject) break;              if (eptr >= md->end_subject) break;
3848              GETCHARLEN(c, eptr, len);              GETCHARLEN(c, eptr, len);
3849              prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);              prop_script = UCD_SCRIPT(c);
3850              if ((prop_script == prop_value) == prop_fail_result)              if ((prop_script == prop_value) == prop_fail_result)
3851                break;                break;
3852              eptr+= len;              eptr+= len;
# Line 3770  for (;;) Line 3875  for (;;)
3875            {            {
3876            if (eptr >= md->end_subject) break;            if (eptr >= md->end_subject) break;
3877            GETCHARINCTEST(c, eptr);            GETCHARINCTEST(c, eptr);
3878            prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);            prop_category = UCD_CATEGORY(c);
3879            if (prop_category == ucp_M) break;            if (prop_category == ucp_M) break;
3880            while (eptr < md->end_subject)            while (eptr < md->end_subject)
3881              {              {
# Line 3779  for (;;) Line 3884  for (;;)
3884                {                {
3885                GETCHARLEN(c, eptr, len);                GETCHARLEN(c, eptr, len);
3886                }                }
3887              prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);              prop_category = UCD_CATEGORY(c);
3888              if (prop_category != ucp_M) break;              if (prop_category != ucp_M) break;
3889              eptr += len;              eptr += len;
3890              }              }
# Line 3801  for (;;) Line 3906  for (;;)
3906                BACKCHAR(eptr);                BACKCHAR(eptr);
3907                GETCHARLEN(c, eptr, len);                GETCHARLEN(c, eptr, len);
3908                }                }
3909              prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);              prop_category = UCD_CATEGORY(c);
3910              if (prop_category != ucp_M) break;              if (prop_category != ucp_M) break;
3911              eptr--;              eptr--;
3912              }              }
# Line 3821  for (;;) Line 3926  for (;;)
3926            case OP_ANY:            case OP_ANY:
3927            if (max < INT_MAX)            if (max < INT_MAX)
3928              {              {
3929              if ((ims & PCRE_DOTALL) == 0)              for (i = min; i < max; i++)
               {  
               for (i = min; i < max; i++)  
                 {  
                 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;  
                 eptr++;  
                 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;  
                 }  
               }  
             else  
3930                {                {
3931                for (i = min; i < max; i++)                if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3932                  {                eptr++;
3933                  if (eptr >= md->end_subject) break;                while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
                 eptr++;  
                 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;  
                 }  
3934                }                }
3935              }              }
3936    
# Line 3845  for (;;) Line 3938  for (;;)
3938    
3939            else            else
3940              {              {
3941              if ((ims & PCRE_DOTALL) == 0)              for (i = min; i < max; i++)
3942                {                {
3943                for (i = min; i < max; i++)                if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3944                  {                eptr++;
3945                  if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;                while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
                 eptr++;  
                 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;  
                 }  
3946                }                }
3947              else              }
3948              break;
3949    
3950              case OP_ALLANY:
3951              if (max < INT_MAX)
3952                {
3953                for (i = min; i < max; i++)
3954                {                {
3955                eptr = md->end_subject;                if (eptr >= md->end_subject) break;
3956                  eptr++;
3957                  while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3958                }                }
3959              }              }
3960              else eptr = md->end_subject;   /* Unlimited UTF-8 repeat */
3961            break;            break;
3962    
3963            /* The byte case is the same as non-UTF8 */            /* The byte case is the same as non-UTF8 */
# Line 3883  for (;;) Line 3982  for (;;)
3982                }                }
3983              else              else
3984                {                {
3985                if (c != 0x000a && c != 0x000b && c != 0x000c &&                if (c != 0x000a &&
3986                    c != 0x0085 && c != 0x2028 && c != 0x2029)                    (md->bsr_anycrlf ||
3987                       (c != 0x000b && c != 0x000c &&
3988                        c != 0x0085 && c != 0x2028 && c != 0x2029)))
3989                  break;                  break;
3990                eptr += len;                eptr += len;
3991                }                }
# Line 4044  for (;;) Line 4145  for (;;)
4145          switch(ctype)          switch(ctype)
4146            {            {
4147            case OP_ANY:            case OP_ANY:
4148            if ((ims & PCRE_DOTALL) == 0)            for (i = min; i < max; i++)
4149              {              {
4150              for (i = min; i < max; i++)              if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
4151                {              eptr++;
               if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;  
               eptr++;  
               }  
             break;  
4152              }              }
4153            /* For DOTALL case, fall through and treat as \C */            break;
4154    
4155              case OP_ALLANY:
4156            case OP_ANYBYTE:            case OP_ANYBYTE:
4157            c = max - min;            c = max - min;
4158            if (c > (unsigned int)(md->end_subject - eptr))            if (c > (unsigned int)(md->end_subject - eptr))
# Line 4074  for (;;) Line 4172  for (;;)
4172                }                }
4173              else              else
4174                {                {
4175                if (c != 0x000a && c != 0x000b && c != 0x000c && c != 0x0085)                if (c != 0x000a &&
4176                      (md->bsr_anycrlf ||
4177                        (c != 0x000b && c != 0x000c && c != 0x0085)))
4178                  break;                  break;
4179                eptr++;                eptr++;
4180                }                }
# Line 4224  HEAP_RETURN: Line 4324  HEAP_RETURN:
4324  switch (frame->Xwhere)  switch (frame->Xwhere)
4325    {    {
4326    LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)    LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
4327    LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(16)    LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
4328    LBL(17) LBL(18) LBL(19) LBL(20) LBL(21) LBL(22) LBL(23) LBL(24)    LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
4329    LBL(25) LBL(26) LBL(27) LBL(28) LBL(29) LBL(30) LBL(31) LBL(32)    LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
4330    LBL(33) LBL(34) LBL(35) LBL(36) LBL(37) LBL(38) LBL(39) LBL(40)    LBL(53) LBL(54)
4331    LBL(41) LBL(42) LBL(43) LBL(44) LBL(45) LBL(46) LBL(47) LBL(48)  #ifdef SUPPORT_UTF8
4332    LBL(49) LBL(50) LBL(51) LBL(52) LBL(53) LBL(54)    LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
4333      LBL(32) LBL(34) LBL(42) LBL(46)
4334    #ifdef SUPPORT_UCP
4335      LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
4336    #endif  /* SUPPORT_UCP */
4337    #endif  /* SUPPORT_UTF8 */
4338    default:    default:
4339    DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));    DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
4340    return PCRE_ERROR_INTERNAL;    return PCRE_ERROR_INTERNAL;
# Line 4321  Returns: > 0 => success; value Line 4426  Returns: > 0 => success; value
4426                   < -1 => some kind of unexpected problem                   < -1 => some kind of unexpected problem
4427  */  */
4428    
4429  PCRE_EXP_DEFN int  PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
4430  pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,  pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
4431    PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,    PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
4432    int offsetcount)    int offsetcount)
# Line 4408  if (re->magic_number != MAGIC_NUMBER) Line 4513  if (re->magic_number != MAGIC_NUMBER)
4513  /* Set up other data */  /* Set up other data */
4514    
4515  anchored = ((re->options | options) & PCRE_ANCHORED) != 0;  anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
4516  startline = (re->options & PCRE_STARTLINE) != 0;  startline = (re->flags & PCRE_STARTLINE) != 0;
4517  firstline = (re->options & PCRE_FIRSTLINE) != 0;  firstline = (re->options & PCRE_FIRSTLINE) != 0;
4518    
4519  /* The code starts after the real_pcre block and the capture name table. */  /* The code starts after the real_pcre block and the capture name table. */
# Line 4423  end_subject = md->end_subject; Line 4528  end_subject = md->end_subject;
4528    
4529  md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;  md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
4530  utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;  utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
4531    md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
4532    
4533  md->notbol = (options & PCRE_NOTBOL) != 0;  md->notbol = (options & PCRE_NOTBOL) != 0;
4534  md->noteol = (options & PCRE_NOTEOL) != 0;  md->noteol = (options & PCRE_NOTEOL) != 0;
# Line 4435  md->recursive = NULL; Line 4541  md->recursive = NULL;
4541  md->lcc = tables + lcc_offset;  md->lcc = tables + lcc_offset;
4542  md->ctypes = tables + ctypes_offset;  md->ctypes = tables + ctypes_offset;
4543    
4544    /* Handle different \R options. */
4545    
4546    switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
4547      {
4548      case 0:
4549      if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
4550        md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
4551      else
4552    #ifdef BSR_ANYCRLF
4553      md->bsr_anycrlf = TRUE;
4554    #else
4555      md->bsr_anycrlf = FALSE;
4556    #endif
4557      break;
4558    
4559      case PCRE_BSR_ANYCRLF:
4560      md->bsr_anycrlf = TRUE;
4561      break;
4562    
4563      case PCRE_BSR_UNICODE:
4564      md->bsr_anycrlf = FALSE;
4565      break;
4566    
4567      default: return PCRE_ERROR_BADNEWLINE;
4568      }
4569    
4570  /* Handle different types of newline. The three bits give eight cases. If  /* Handle different types of newline. The three bits give eight cases. If
4571  nothing is set at run time, whatever was used at compile time applies. */  nothing is set at run time, whatever was used at compile time applies. */
4572    
4573  switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &  switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
4574         PCRE_NEWLINE_BITS)          (pcre_uint32)options) & PCRE_NEWLINE_BITS)
4575    {    {
4576    case 0: newline = NEWLINE; break;   /* Compile-time default */    case 0: newline = NEWLINE; break;   /* Compile-time default */
4577    case PCRE_NEWLINE_CR: newline = '\r'; break;    case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
4578    case PCRE_NEWLINE_LF: newline = '\n'; break;    case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
4579    case PCRE_NEWLINE_CR+    case PCRE_NEWLINE_CR+
4580         PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;         PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
4581    case PCRE_NEWLINE_ANY: newline = -1; break;    case PCRE_NEWLINE_ANY: newline = -1; break;
4582    case PCRE_NEWLINE_ANYCRLF: newline = -2; break;    case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
4583    default: return PCRE_ERROR_BADNEWLINE;    default: return PCRE_ERROR_BADNEWLINE;
# Line 4478  else Line 4610  else
4610  /* Partial matching is supported only for a restricted set of regexes at the  /* Partial matching is supported only for a restricted set of regexes at the
4611  moment. */  moment. */
4612    
4613  if (md->partial && (re->options & PCRE_NOPARTIAL) != 0)  if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
4614    return PCRE_ERROR_BADPARTIAL;    return PCRE_ERROR_BADPARTIAL;
4615    
4616  /* Check a UTF-8 string if required. Unfortunately there's no way of passing  /* Check a UTF-8 string if required. Unfortunately there's no way of passing
# Line 4487  back the character offset. */ Line 4619  back the character offset. */
4619  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
4620  if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)  if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
4621    {    {
4622    if (_pcre_valid_utf8((uschar *)subject, length) >= 0)    if (_pcre_valid_utf8((USPTR)subject, length) >= 0)
4623      return PCRE_ERROR_BADUTF8;      return PCRE_ERROR_BADUTF8;
4624    if (start_offset > 0 && start_offset < length)    if (start_offset > 0 && start_offset < length)
4625      {      {
4626      int tb = ((uschar *)subject)[start_offset];      int tb = ((USPTR)subject)[start_offset];
4627      if (tb > 127)      if (tb > 127)
4628        {        {
4629        tb &= 0xc0;        tb &= 0xc0;
# Line 4555  studied, there may be a bitmap of possib Line 4687  studied, there may be a bitmap of possib
4687    
4688  if (!anchored)  if (!anchored)
4689    {    {
4690    if ((re->options & PCRE_FIRSTSET) != 0)    if ((re->flags & PCRE_FIRSTSET) != 0)
4691      {      {
4692      first_byte = re->first_byte & 255;      first_byte = re->first_byte & 255;
4693      if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)      if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
# Line 4570  if (!anchored) Line 4702  if (!anchored)
4702  /* For anchored or unanchored matches, there may be a "last known required  /* For anchored or unanchored matches, there may be a "last known required
4703  character" set. */  character" set. */
4704    
4705  if ((re->options & PCRE_REQCHSET) != 0)  if ((re->flags & PCRE_REQCHSET) != 0)
4706    {    {
4707    req_byte = re->req_byte & 255;    req_byte = re->req_byte & 255;
4708    req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;    req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
# Line 4597  for(;;) Line 4729  for(;;)
4729      while (iptr < iend) *iptr++ = -1;      while (iptr < iend) *iptr++ = -1;
4730      }      }
4731    
4732    /* Advance to a unique first char if possible. If firstline is TRUE, the    /* If firstline is TRUE, the start of the match is constrained to the first
4733    start of the match is constrained to the first line of a multiline string.    line of a multiline string. That is, the match must be before or at the first
4734    That is, the match must be before or at the first newline. Implement this by    newline. Implement this by temporarily adjusting end_subject so that we stop
4735    temporarily adjusting end_subject so that we stop scanning at a newline. If    scanning at a newline. If the match fails at the newline, later code breaks
4736    the match fails at the newline, later code breaks this loop. */    this loop. */
4737    
4738    if (firstline)    if (firstline)
4739      {      {
4740      USPTR t = start_match;      USPTR t = start_match;
4741    #ifdef SUPPORT_UTF8
4742        if (utf8)
4743          {
4744          while (t < md->end_subject && !IS_NEWLINE(t))
4745            {
4746            t++;
4747            while (t < end_subject && (*t & 0xc0) == 0x80) t++;
4748            }
4749          }
4750        else
4751    #endif
4752      while (t < md->end_subject && !IS_NEWLINE(t)) t++;      while (t < md->end_subject && !IS_NEWLINE(t)) t++;
4753      end_subject = t;      end_subject = t;
4754      }      }
4755    
4756    /* Now test for a unique first byte */    /* There are some optimizations that avoid running the match if a known
4757      starting point is not found, or if a known later character is not present.
4758      However, there is an option that disables these, for testing and for ensuring
4759      that all callouts do actually occur. */
4760    
4761    if (first_byte >= 0)    if ((options & PCRE_NO_START_OPTIMIZE) == 0)
4762      {      {
4763      if (first_byte_caseless)      /* Advance to a unique first byte if there is one. */
       while (start_match < end_subject &&  
              md->lcc[*start_match] != first_byte)  
         start_match++;  
     else  
       while (start_match < end_subject && *start_match != first_byte)  
         start_match++;  
     }  
4764    
4765    /* Or to just after a linebreak for a multiline match if possible */      if (first_byte >= 0)
4766          {
4767          if (first_byte_caseless)
4768            while (start_match < end_subject && md->lcc[*start_match] != first_byte)
4769              start_match++;
4770          else
4771            while (start_match < end_subject && *start_match != first_byte)
4772              start_match++;
4773          }
4774    
4775    else if (startline)      /* Or to just after a linebreak for a multiline match */
4776      {  
4777      if (start_match > md->start_subject + start_offset)      else if (startline)
4778        {        {
4779        while (start_match <= end_subject && !WAS_NEWLINE(start_match))        if (start_match > md->start_subject + start_offset)
4780          start_match++;          {
4781    #ifdef SUPPORT_UTF8
4782            if (utf8)
4783              {
4784              while (start_match < end_subject && !WAS_NEWLINE(start_match))
4785                {
4786                start_match++;
4787                while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
4788                  start_match++;
4789                }
4790              }
4791            else
4792    #endif
4793            while (start_match < end_subject && !WAS_NEWLINE(start_match))
4794              start_match++;
4795    
4796            /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
4797            and we are now at a LF, advance the match position by one more character.
4798            */
4799    
4800        /* If we have just passed a CR and the newline option is ANY or ANYCRLF,          if (start_match[-1] == CHAR_CR &&
4801        and we are now at a LF, advance the match position by one more character.               (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
4802        */               start_match < end_subject &&
4803                 *start_match == CHAR_NL)
4804        if (start_match[-1] == '\r' &&            start_match++;
4805             (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&          }
            start_match < end_subject &&  
            *start_match == '\n')  
         start_match++;  
4806        }        }
     }  
4807    
4808    /* Or to a non-unique first char after study */      /* Or to a non-unique first byte after study */
4809    
4810    else if (start_bits != NULL)      else if (start_bits != NULL)
     {  
     while (start_match < end_subject)  
4811        {        {
4812        register unsigned int c = *start_match;        while (start_match < end_subject)
4813        if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;          {
4814            register unsigned int c = *start_match;
4815            if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++;
4816              else break;
4817            }
4818        }        }
4819      }      }   /* Starting optimizations */
4820    
4821    /* Restore fudged end_subject */    /* Restore fudged end_subject */
4822    
# Line 4665  for(;;) Line 4828  for(;;)
4828    printf("\n");    printf("\n");
4829  #endif  #endif
4830    
4831    /* If req_byte is set, we know that that character must appear in the subject    /* If req_byte is set, we know that that character must appear in the
4832    for the match to succeed. If the first character is set, req_byte must be    subject for the match to succeed. If the first character is set, req_byte
4833    later in the subject; otherwise the test starts at the match point. This    must be later in the subject; otherwise the test starts at the match point.
4834    optimization can save a huge amount of backtracking in patterns with nested    This optimization can save a huge amount of backtracking in patterns with
4835    unlimited repeats that aren't going to match. Writing separate code for    nested unlimited repeats that aren't going to match. Writing separate code
4836    cased/caseless versions makes it go faster, as does using an autoincrement    for cased/caseless versions makes it go faster, as does using an
4837    and backing off on a match.    autoincrement and backing off on a match.
4838    
4839    HOWEVER: when the subject string is very, very long, searching to its end can    HOWEVER: when the subject string is very, very long, searching to its end
4840    take a long time, and give bad performance on quite ordinary patterns. This    can take a long time, and give bad performance on quite ordinary patterns.
4841    showed up when somebody was matching something like /^\d+C/ on a 32-megabyte    This showed up when somebody was matching something like /^\d+C/ on a
4842    string... so we don't do this when the string is sufficiently long.    32-megabyte string... so we don't do this when the string is sufficiently
4843      long.
4844    
4845    ALSO: this processing is disabled when partial matching is requested.    ALSO: this processing is disabled when partial matching is requested, or if
4846    */    disabling is explicitly requested. */
4847    
4848    if (req_byte >= 0 &&    if ((options & PCRE_NO_START_OPTIMIZE) == 0 &&
4849          req_byte >= 0 &&
4850        end_subject - start_match < REQ_BYTE_MAX &&        end_subject - start_match < REQ_BYTE_MAX &&
4851        !md->partial)        !md->partial)
4852      {      {
# Line 4789  for(;;) Line 4954  for(;;)
4954    not contain any explicit matches for \r or \n, and the newline option is CRLF    not contain any explicit matches for \r or \n, and the newline option is CRLF
4955    or ANY or ANYCRLF, advance the match position by one more character. */    or ANY or ANYCRLF, advance the match position by one more character. */
4956    
4957    if (start_match[-1] == '\r' &&    if (start_match[-1] == CHAR_CR &&
4958        start_match < end_subject &&        start_match < end_subject &&
4959        *start_match == '\n' &&        *start_match == CHAR_NL &&
4960        (re->options & PCRE_HASCRORLF) == 0 &&        (re->flags & PCRE_HASCRORLF) == 0 &&
4961          (md->nltype == NLTYPE_ANY ||          (md->nltype == NLTYPE_ANY ||
4962           md->nltype == NLTYPE_ANYCRLF ||           md->nltype == NLTYPE_ANYCRLF ||
4963           md->nllen == 2))           md->nllen == 2))

Legend:
Removed from v.227  
changed lines
  Added in v.409

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12