/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Diff of /code/trunk/pcre_exec.c

Parent Directory | Revision Log | View Patch

revision 227 by ph10, Tue Aug 21 15:00:15 2007 UTC revision 395 by ph10, Fri Mar 20 11:22:42 2009 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2007 University of Cambridge             Copyright (c) 1997-2009 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 43  pattern matching using an NFA algorithm, Line 43  pattern matching using an NFA algorithm,
43  possible. There are also some static supporting functions. */  possible. There are also some static supporting functions. */
44    
45  #ifdef HAVE_CONFIG_H  #ifdef HAVE_CONFIG_H
46  #include <config.h>  #include "config.h"
47  #endif  #endif
48    
49  #define NLBLOCK md             /* Block containing newline information */  #define NLBLOCK md             /* Block containing newline information */
# Line 158  printf("\n"); Line 158  printf("\n");
158    
159  if (length > md->end_subject - eptr) return FALSE;  if (length > md->end_subject - eptr) return FALSE;
160    
161  /* Separate the caselesss case for speed */  /* Separate the caseless case for speed. In UTF-8 mode we can only do this
162    properly if Unicode properties are supported. Otherwise, we can check only
163    ASCII characters. */
164    
165  if ((ims & PCRE_CASELESS) != 0)  if ((ims & PCRE_CASELESS) != 0)
166    {    {
167    #ifdef SUPPORT_UTF8
168    #ifdef SUPPORT_UCP
169      if (md->utf8)
170        {
171        USPTR endptr = eptr + length;
172        while (eptr < endptr)
173          {
174          int c, d;
175          GETCHARINC(c, eptr);
176          GETCHARINC(d, p);
177          if (c != d && c != UCD_OTHERCASE(d)) return FALSE;
178          }
179        }
180      else
181    #endif
182    #endif
183    
184      /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
185      is no UCP support. */
186    
187    while (length-- > 0)    while (length-- > 0)
188      if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;      { if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; }
189    }    }
190    
191    /* In the caseful case, we can just compare the bytes, whether or not we
192    are in UTF-8 mode. */
193    
194  else  else
195    { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }    { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
196    
# Line 609  for (;;) Line 635  for (;;)
635    {    {
636    minimize = possessive = FALSE;    minimize = possessive = FALSE;
637    op = *ecode;    op = *ecode;
638    
639    /* For partial matching, remember if we ever hit the end of the subject after    /* For partial matching, remember if we ever hit the end of the subject after
640    matching at least one subject character. */    matching at least one subject character. */
641    
# Line 761  for (;;) Line 787  for (;;)
787    
788      case OP_COND:      case OP_COND:
789      case OP_SCOND:      case OP_SCOND:
790        /* Because of the way auto-callout works during compile, a callout item is
791        inserted between OP_COND and an assertion condition. */
792    
793        if (ecode[LINK_SIZE+1] == OP_CALLOUT)
794          {
795          if (pcre_callout != NULL)
796            {
797            pcre_callout_block cb;
798            cb.version          = 1;   /* Version 1 of the callout block */
799            cb.callout_number   = ecode[LINK_SIZE+2];
800            cb.offset_vector    = md->offset_vector;
801            cb.subject          = (PCRE_SPTR)md->start_subject;
802            cb.subject_length   = md->end_subject - md->start_subject;
803            cb.start_match      = mstart - md->start_subject;
804            cb.current_position = eptr - md->start_subject;
805            cb.pattern_position = GET(ecode, LINK_SIZE + 3);
806            cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
807            cb.capture_top      = offset_top/2;
808            cb.capture_last     = md->capture_last;
809            cb.callout_data     = md->callout_data;
810            if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
811            if (rrc < 0) RRETURN(rrc);
812            }
813          ecode += _pcre_OP_lengths[OP_CALLOUT];
814          }
815    
816        /* Now see what the actual condition is */
817    
818      if (ecode[LINK_SIZE+1] == OP_RREF)         /* Recursion test */      if (ecode[LINK_SIZE+1] == OP_RREF)         /* Recursion test */
819        {        {
820        offset = GET2(ecode, LINK_SIZE + 2);     /* Recursion group number*/        offset = GET2(ecode, LINK_SIZE + 2);     /* Recursion group number*/
# Line 826  for (;;) Line 880  for (;;)
880          goto TAIL_RECURSE;          goto TAIL_RECURSE;
881          }          }
882        }        }
883      else                         /* Condition false & no 2nd alternative */      else                         /* Condition false & no alternative */
884        {        {
885        ecode += 1 + LINK_SIZE;        ecode += 1 + LINK_SIZE;
886        }        }
# Line 1148  for (;;) Line 1202  for (;;)
1202      do ecode += GET(ecode,1); while (*ecode == OP_ALT);      do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1203      break;      break;
1204    
1205      /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating      /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1206      that it may occur zero times. It may repeat infinitely, or not at all -      indicating that it may occur zero times. It may repeat infinitely, or not
1207      i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper      at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1208      repeat limits are compiled as a number of copies, with the optional ones      with fixed upper repeat limits are compiled as a number of copies, with the
1209      preceded by BRAZERO or BRAMINZERO. */      optional ones preceded by BRAZERO or BRAMINZERO. */
1210    
1211      case OP_BRAZERO:      case OP_BRAZERO:
1212        {        {
# Line 1174  for (;;) Line 1228  for (;;)
1228        }        }
1229      break;      break;
1230    
1231        case OP_SKIPZERO:
1232          {
1233          next = ecode+1;
1234          do next += GET(next,1); while (*next == OP_ALT);
1235          ecode = next + 1 + LINK_SIZE;
1236          }
1237        break;
1238    
1239      /* End of a group, repeated or non-repeating. */      /* End of a group, repeated or non-repeating. */
1240    
1241      case OP_KET:      case OP_KET:
# Line 1421  for (;;) Line 1483  for (;;)
1483      /* Match a single character type; inline for speed */      /* Match a single character type; inline for speed */
1484    
1485      case OP_ANY:      case OP_ANY:
1486      if ((ims & PCRE_DOTALL) == 0)      if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1487        {      /* Fall through */
1488        if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);  
1489        }      case OP_ALLANY:
1490      if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);      if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1491      if (utf8)      if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
       while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;  
1492      ecode++;      ecode++;
1493      break;      break;
1494    
# Line 1526  for (;;) Line 1587  for (;;)
1587        case 0x000d:        case 0x000d:
1588        if (eptr < md->end_subject && *eptr == 0x0a) eptr++;        if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
1589        break;        break;
1590    
1591        case 0x000a:        case 0x000a:
1592          break;
1593    
1594        case 0x000b:        case 0x000b:
1595        case 0x000c:        case 0x000c:
1596        case 0x0085:        case 0x0085:
1597        case 0x2028:        case 0x2028:
1598        case 0x2029:        case 0x2029:
1599          if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
1600        break;        break;
1601        }        }
1602      ecode++;      ecode++;
# Line 1642  for (;;) Line 1707  for (;;)
1707      if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);      if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1708      GETCHARINCTEST(c, eptr);      GETCHARINCTEST(c, eptr);
1709        {        {
1710        int chartype, script;        const ucd_record *prop = GET_UCD(c);
       int category = _pcre_ucp_findprop(c, &chartype, &script);  
1711    
1712        switch(ecode[1])        switch(ecode[1])
1713          {          {
# Line 1652  for (;;) Line 1716  for (;;)
1716          break;          break;
1717    
1718          case PT_LAMP:          case PT_LAMP:
1719          if ((chartype == ucp_Lu ||          if ((prop->chartype == ucp_Lu ||
1720               chartype == ucp_Ll ||               prop->chartype == ucp_Ll ||
1721               chartype == ucp_Lt) == (op == OP_NOTPROP))               prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
1722            RRETURN(MATCH_NOMATCH);            RRETURN(MATCH_NOMATCH);
1723           break;           break;
1724    
1725          case PT_GC:          case PT_GC:
1726          if ((ecode[2] != category) == (op == OP_PROP))          if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
1727            RRETURN(MATCH_NOMATCH);            RRETURN(MATCH_NOMATCH);
1728          break;          break;
1729    
1730          case PT_PC:          case PT_PC:
1731          if ((ecode[2] != chartype) == (op == OP_PROP))          if ((ecode[2] != prop->chartype) == (op == OP_PROP))
1732            RRETURN(MATCH_NOMATCH);            RRETURN(MATCH_NOMATCH);
1733          break;          break;
1734    
1735          case PT_SC:          case PT_SC:
1736          if ((ecode[2] != script) == (op == OP_PROP))          if ((ecode[2] != prop->script) == (op == OP_PROP))
1737            RRETURN(MATCH_NOMATCH);            RRETURN(MATCH_NOMATCH);
1738          break;          break;
1739    
# Line 1688  for (;;) Line 1752  for (;;)
1752      if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);      if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1753      GETCHARINCTEST(c, eptr);      GETCHARINCTEST(c, eptr);
1754        {        {
1755        int chartype, script;        int category = UCD_CATEGORY(c);
       int category = _pcre_ucp_findprop(c, &chartype, &script);  
1756        if (category == ucp_M) RRETURN(MATCH_NOMATCH);        if (category == ucp_M) RRETURN(MATCH_NOMATCH);
1757        while (eptr < md->end_subject)        while (eptr < md->end_subject)
1758          {          {
# Line 1698  for (;;) Line 1761  for (;;)
1761            {            {
1762            GETCHARLEN(c, eptr, len);            GETCHARLEN(c, eptr, len);
1763            }            }
1764          category = _pcre_ucp_findprop(c, &chartype, &script);          category = UCD_CATEGORY(c);
1765          if (category != ucp_M) break;          if (category != ucp_M) break;
1766          eptr += len;          eptr += len;
1767          }          }
# Line 1719  for (;;) Line 1782  for (;;)
1782      case OP_REF:      case OP_REF:
1783        {        {
1784        offset = GET2(ecode, 1) << 1;               /* Doubled ref number */        offset = GET2(ecode, 1) << 1;               /* Doubled ref number */
1785        ecode += 3;                                 /* Advance past item */        ecode += 3;
1786    
1787          /* If the reference is unset, there are two possibilities:
1788    
1789          (a) In the default, Perl-compatible state, set the length to be longer
1790          than the amount of subject left; this ensures that every attempt at a
1791          match fails. We can't just fail here, because of the possibility of
1792          quantifiers with zero minima.
1793    
1794        /* If the reference is unset, set the length to be longer than the amount        (b) If the JavaScript compatibility flag is set, set the length to zero
1795        of subject left; this ensures that every attempt at a match fails. We        so that the back reference matches an empty string.
1796        can't just fail here, because of the possibility of quantifiers with zero  
1797        minima. */        Otherwise, set the length to the length of what was matched by the
1798          referenced subpattern. */
1799        length = (offset >= offset_top || md->offset_vector[offset] < 0)?  
1800          md->end_subject - eptr + 1 :        if (offset >= offset_top || md->offset_vector[offset] < 0)
1801          md->offset_vector[offset+1] - md->offset_vector[offset];          length = (md->jscript_compat)? 0 : md->end_subject - eptr + 1;
1802          else
1803            length = md->offset_vector[offset+1] - md->offset_vector[offset];
1804    
1805        /* Set up for repetition, or handle the non-repeated case */        /* Set up for repetition, or handle the non-repeated case */
1806    
# Line 2003  for (;;) Line 2075  for (;;)
2075    
2076    
2077      /* Match an extended character class. This opcode is encountered only      /* Match an extended character class. This opcode is encountered only
2078      in UTF-8 mode, because that's the only time it is compiled. */      when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2079        mode, because Unicode properties are supported in non-UTF-8 mode. */
2080    
2081  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2082      case OP_XCLASS:      case OP_XCLASS:
# Line 2045  for (;;) Line 2118  for (;;)
2118        for (i = 1; i <= min; i++)        for (i = 1; i <= min; i++)
2119          {          {
2120          if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);          if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2121          GETCHARINC(c, eptr);          GETCHARINCTEST(c, eptr);
2122          if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);          if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2123          }          }
2124    
# Line 2064  for (;;) Line 2137  for (;;)
2137            RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);            RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
2138            if (rrc != MATCH_NOMATCH) RRETURN(rrc);            if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2139            if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);            if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2140            GETCHARINC(c, eptr);            GETCHARINCTEST(c, eptr);
2141            if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);            if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2142            }            }
2143          /* Control never gets here */          /* Control never gets here */
# Line 2079  for (;;) Line 2152  for (;;)
2152            {            {
2153            int len = 1;            int len = 1;
2154            if (eptr >= md->end_subject) break;            if (eptr >= md->end_subject) break;
2155            GETCHARLEN(c, eptr, len);            GETCHARLENTEST(c, eptr, len);
2156            if (!_pcre_xclass(c, data)) break;            if (!_pcre_xclass(c, data)) break;
2157            eptr += len;            eptr += len;
2158            }            }
# Line 2154  for (;;) Line 2227  for (;;)
2227          if (fc != dc)          if (fc != dc)
2228            {            {
2229  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2230            if (dc != _pcre_ucp_othercase(fc))            if (dc != UCD_OTHERCASE(fc))
2231  #endif  #endif
2232              RRETURN(MATCH_NOMATCH);              RRETURN(MATCH_NOMATCH);
2233            }            }
# Line 2245  for (;;) Line 2318  for (;;)
2318  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2319          unsigned int othercase;          unsigned int othercase;
2320          if ((ims & PCRE_CASELESS) != 0 &&          if ((ims & PCRE_CASELESS) != 0 &&
2321              (othercase = _pcre_ucp_othercase(fc)) != NOTACHAR)              (othercase = UCD_OTHERCASE(fc)) != fc)
2322            oclength = _pcre_ord2utf8(othercase, occhars);            oclength = _pcre_ord2utf8(othercase, occhars);
2323          else oclength = 0;          else oclength = 0;
2324  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
# Line 2565  for (;;) Line 2638  for (;;)
2638              {              {
2639              RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);              RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
2640              if (rrc != MATCH_NOMATCH) RRETURN(rrc);              if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2641                if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2642              GETCHARINC(d, eptr);              GETCHARINC(d, eptr);
2643              if (d < 256) d = md->lcc[d];              if (d < 256) d = md->lcc[d];
2644              if (fi >= max || eptr >= md->end_subject || fc == d)              if (fc == d) RRETURN(MATCH_NOMATCH);
2645                RRETURN(MATCH_NOMATCH);  
2646              }              }
2647            }            }
2648          else          else
# Line 2674  for (;;) Line 2748  for (;;)
2748              {              {
2749              RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);              RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
2750              if (rrc != MATCH_NOMATCH) RRETURN(rrc);              if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2751                if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2752              GETCHARINC(d, eptr);              GETCHARINC(d, eptr);
2753              if (fi >= max || eptr >= md->end_subject || fc == d)              if (fc == d) RRETURN(MATCH_NOMATCH);
               RRETURN(MATCH_NOMATCH);  
2754              }              }
2755            }            }
2756          else          else
# Line 2850  for (;;) Line 2924  for (;;)
2924              {              {
2925              if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);              if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2926              GETCHARINCTEST(c, eptr);              GETCHARINCTEST(c, eptr);
2927              prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);              prop_chartype = UCD_CHARTYPE(c);
2928              if ((prop_chartype == ucp_Lu ||              if ((prop_chartype == ucp_Lu ||
2929                   prop_chartype == ucp_Ll ||                   prop_chartype == ucp_Ll ||
2930                   prop_chartype == ucp_Lt) == prop_fail_result)                   prop_chartype == ucp_Lt) == prop_fail_result)
# Line 2863  for (;;) Line 2937  for (;;)
2937              {              {
2938              if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);              if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2939              GETCHARINCTEST(c, eptr);              GETCHARINCTEST(c, eptr);
2940              prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);              prop_category = UCD_CATEGORY(c);
2941              if ((prop_category == prop_value) == prop_fail_result)              if ((prop_category == prop_value) == prop_fail_result)
2942                RRETURN(MATCH_NOMATCH);                RRETURN(MATCH_NOMATCH);
2943              }              }
# Line 2874  for (;;) Line 2948  for (;;)
2948              {              {
2949              if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);              if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2950              GETCHARINCTEST(c, eptr);              GETCHARINCTEST(c, eptr);
2951              prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);              prop_chartype = UCD_CHARTYPE(c);
2952              if ((prop_chartype == prop_value) == prop_fail_result)              if ((prop_chartype == prop_value) == prop_fail_result)
2953                RRETURN(MATCH_NOMATCH);                RRETURN(MATCH_NOMATCH);
2954              }              }
# Line 2885  for (;;) Line 2959  for (;;)
2959              {              {
2960              if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);              if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2961              GETCHARINCTEST(c, eptr);              GETCHARINCTEST(c, eptr);
2962              prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);              prop_script = UCD_SCRIPT(c);
2963              if ((prop_script == prop_value) == prop_fail_result)              if ((prop_script == prop_value) == prop_fail_result)
2964                RRETURN(MATCH_NOMATCH);                RRETURN(MATCH_NOMATCH);
2965              }              }
# Line 2904  for (;;) Line 2978  for (;;)
2978          for (i = 1; i <= min; i++)          for (i = 1; i <= min; i++)
2979            {            {
2980            GETCHARINCTEST(c, eptr);            GETCHARINCTEST(c, eptr);
2981            prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);            prop_category = UCD_CATEGORY(c);
2982            if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);            if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
2983            while (eptr < md->end_subject)            while (eptr < md->end_subject)
2984              {              {
# Line 2913  for (;;) Line 2987  for (;;)
2987                {                {
2988                GETCHARLEN(c, eptr, len);                GETCHARLEN(c, eptr, len);
2989                }                }
2990              prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);              prop_category = UCD_CATEGORY(c);
2991              if (prop_category != ucp_M) break;              if (prop_category != ucp_M) break;
2992              eptr += len;              eptr += len;
2993              }              }
# Line 2931  for (;;) Line 3005  for (;;)
3005          case OP_ANY:          case OP_ANY:
3006          for (i = 1; i <= min; i++)          for (i = 1; i <= min; i++)
3007            {            {
3008            if (eptr >= md->end_subject ||            if (eptr >= md->end_subject || IS_NEWLINE(eptr))
                ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))  
3009              RRETURN(MATCH_NOMATCH);              RRETURN(MATCH_NOMATCH);
3010            eptr++;            eptr++;
3011            while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;            while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3012            }            }
3013          break;          break;
3014    
3015            case OP_ALLANY:
3016            for (i = 1; i <= min; i++)
3017              {
3018              if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3019              eptr++;
3020              while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3021              }
3022            break;
3023    
3024          case OP_ANYBYTE:          case OP_ANYBYTE:
3025          eptr += min;          eptr += min;
3026          break;          break;
# Line 2954  for (;;) Line 3036  for (;;)
3036              case 0x000d:              case 0x000d:
3037              if (eptr < md->end_subject && *eptr == 0x0a) eptr++;              if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3038              break;              break;
3039    
3040              case 0x000a:              case 0x000a:
3041                break;
3042    
3043              case 0x000b:              case 0x000b:
3044              case 0x000c:              case 0x000c:
3045              case 0x0085:              case 0x0085:
3046              case 0x2028:              case 0x2028:
3047              case 0x2029:              case 0x2029:
3048                if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3049              break;              break;
3050              }              }
3051            }            }
# Line 3143  for (;;) Line 3229  for (;;)
3229        switch(ctype)        switch(ctype)
3230          {          {
3231          case OP_ANY:          case OP_ANY:
3232          if ((ims & PCRE_DOTALL) == 0)          for (i = 1; i <= min; i++)
3233            {            {
3234            for (i = 1; i <= min; i++)            if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
3235              {            eptr++;
             if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);  
             eptr++;  
             }  
3236            }            }
3237          else eptr += min;          break;
3238    
3239            case OP_ALLANY:
3240            eptr += min;
3241          break;          break;
3242    
3243          case OP_ANYBYTE:          case OP_ANYBYTE:
# Line 3172  for (;;) Line 3258  for (;;)
3258              if (eptr < md->end_subject && *eptr == 0x0a) eptr++;              if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3259              break;              break;
3260              case 0x000a:              case 0x000a:
3261                break;
3262    
3263              case 0x000b:              case 0x000b:
3264              case 0x000c:              case 0x000c:
3265              case 0x0085:              case 0x0085:
3266                if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3267              break;              break;
3268              }              }
3269            }            }
# Line 3314  for (;;) Line 3403  for (;;)
3403              if (rrc != MATCH_NOMATCH) RRETURN(rrc);              if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3404              if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);              if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3405              GETCHARINC(c, eptr);              GETCHARINC(c, eptr);
3406              prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);              prop_chartype = UCD_CHARTYPE(c);
3407              if ((prop_chartype == ucp_Lu ||              if ((prop_chartype == ucp_Lu ||
3408                   prop_chartype == ucp_Ll ||                   prop_chartype == ucp_Ll ||
3409                   prop_chartype == ucp_Lt) == prop_fail_result)                   prop_chartype == ucp_Lt) == prop_fail_result)
# Line 3329  for (;;) Line 3418  for (;;)
3418              if (rrc != MATCH_NOMATCH) RRETURN(rrc);              if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3419              if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);              if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3420              GETCHARINC(c, eptr);              GETCHARINC(c, eptr);
3421              prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);              prop_category = UCD_CATEGORY(c);
3422              if ((prop_category == prop_value) == prop_fail_result)              if ((prop_category == prop_value) == prop_fail_result)
3423                RRETURN(MATCH_NOMATCH);                RRETURN(MATCH_NOMATCH);
3424              }              }
# Line 3342  for (;;) Line 3431  for (;;)
3431              if (rrc != MATCH_NOMATCH) RRETURN(rrc);              if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3432              if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);              if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3433              GETCHARINC(c, eptr);              GETCHARINC(c, eptr);
3434              prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);              prop_chartype = UCD_CHARTYPE(c);
3435              if ((prop_chartype == prop_value) == prop_fail_result)              if ((prop_chartype == prop_value) == prop_fail_result)
3436                RRETURN(MATCH_NOMATCH);                RRETURN(MATCH_NOMATCH);
3437              }              }
# Line 3355  for (;;) Line 3444  for (;;)
3444              if (rrc != MATCH_NOMATCH) RRETURN(rrc);              if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3445              if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);              if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3446              GETCHARINC(c, eptr);              GETCHARINC(c, eptr);
3447              prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);              prop_script = UCD_SCRIPT(c);
3448              if ((prop_script == prop_value) == prop_fail_result)              if ((prop_script == prop_value) == prop_fail_result)
3449                RRETURN(MATCH_NOMATCH);                RRETURN(MATCH_NOMATCH);
3450              }              }
# Line 3377  for (;;) Line 3466  for (;;)
3466            if (rrc != MATCH_NOMATCH) RRETURN(rrc);            if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3467            if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);            if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3468            GETCHARINCTEST(c, eptr);            GETCHARINCTEST(c, eptr);
3469            prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);            prop_category = UCD_CATEGORY(c);
3470            if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);            if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
3471            while (eptr < md->end_subject)            while (eptr < md->end_subject)
3472              {              {
# Line 3386  for (;;) Line 3475  for (;;)
3475                {                {
3476                GETCHARLEN(c, eptr, len);                GETCHARLEN(c, eptr, len);
3477                }                }
3478              prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);              prop_category = UCD_CATEGORY(c);
3479              if (prop_category != ucp_M) break;              if (prop_category != ucp_M) break;
3480              eptr += len;              eptr += len;
3481              }              }
# Line 3405  for (;;) Line 3494  for (;;)
3494            RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);            RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
3495            if (rrc != MATCH_NOMATCH) RRETURN(rrc);            if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3496            if (fi >= max || eptr >= md->end_subject ||            if (fi >= max || eptr >= md->end_subject ||
3497                 (ctype == OP_ANY && (ims & PCRE_DOTALL) == 0 &&                 (ctype == OP_ANY && IS_NEWLINE(eptr)))
                 IS_NEWLINE(eptr)))  
3498              RRETURN(MATCH_NOMATCH);              RRETURN(MATCH_NOMATCH);
3499    
3500            GETCHARINC(c, eptr);            GETCHARINC(c, eptr);
3501            switch(ctype)            switch(ctype)
3502              {              {
3503              case OP_ANY:        /* This is the DOTALL case */              case OP_ANY:        /* This is the non-NL case */
3504              break;              case OP_ALLANY:
   
3505              case OP_ANYBYTE:              case OP_ANYBYTE:
3506              break;              break;
3507    
# Line 3426  for (;;) Line 3513  for (;;)
3513                if (eptr < md->end_subject && *eptr == 0x0a) eptr++;                if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3514                break;                break;
3515                case 0x000a:                case 0x000a:
3516                  break;
3517    
3518                case 0x000b:                case 0x000b:
3519                case 0x000c:                case 0x000c:
3520                case 0x0085:                case 0x0085:
3521                case 0x2028:                case 0x2028:
3522                case 0x2029:                case 0x2029:
3523                  if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3524                break;                break;
3525                }                }
3526              break;              break;
# Line 3563  for (;;) Line 3653  for (;;)
3653            RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);            RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
3654            if (rrc != MATCH_NOMATCH) RRETURN(rrc);            if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3655            if (fi >= max || eptr >= md->end_subject ||            if (fi >= max || eptr >= md->end_subject ||
3656                 ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))                 (ctype == OP_ANY && IS_NEWLINE(eptr)))
3657              RRETURN(MATCH_NOMATCH);              RRETURN(MATCH_NOMATCH);
3658    
3659            c = *eptr++;            c = *eptr++;
3660            switch(ctype)            switch(ctype)
3661              {              {
3662              case OP_ANY:   /* This is the DOTALL case */              case OP_ANY:     /* This is the non-NL case */
3663              break;              case OP_ALLANY:
   
3664              case OP_ANYBYTE:              case OP_ANYBYTE:
3665              break;              break;
3666    
# Line 3582  for (;;) Line 3671  for (;;)
3671                case 0x000d:                case 0x000d:
3672                if (eptr < md->end_subject && *eptr == 0x0a) eptr++;                if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3673                break;                break;
3674    
3675                case 0x000a:                case 0x000a:
3676                  break;
3677    
3678                case 0x000b:                case 0x000b:
3679                case 0x000c:                case 0x000c:
3680                case 0x0085:                case 0x0085:
3681                  if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3682                break;                break;
3683                }                }
3684              break;              break;
# Line 3700  for (;;) Line 3793  for (;;)
3793              int len = 1;              int len = 1;
3794              if (eptr >= md->end_subject) break;              if (eptr >= md->end_subject) break;
3795              GETCHARLEN(c, eptr, len);              GETCHARLEN(c, eptr, len);
3796              prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);              prop_chartype = UCD_CHARTYPE(c);
3797              if ((prop_chartype == ucp_Lu ||              if ((prop_chartype == ucp_Lu ||
3798                   prop_chartype == ucp_Ll ||                   prop_chartype == ucp_Ll ||
3799                   prop_chartype == ucp_Lt) == prop_fail_result)                   prop_chartype == ucp_Lt) == prop_fail_result)
# Line 3715  for (;;) Line 3808  for (;;)
3808              int len = 1;              int len = 1;
3809              if (eptr >= md->end_subject) break;              if (eptr >= md->end_subject) break;
3810              GETCHARLEN(c, eptr, len);              GETCHARLEN(c, eptr, len);
3811              prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);              prop_category = UCD_CATEGORY(c);
3812              if ((prop_category == prop_value) == prop_fail_result)              if ((prop_category == prop_value) == prop_fail_result)
3813                break;                break;
3814              eptr+= len;              eptr+= len;
# Line 3728  for (;;) Line 3821  for (;;)
3821              int len = 1;              int len = 1;
3822              if (eptr >= md->end_subject) break;              if (eptr >= md->end_subject) break;
3823              GETCHARLEN(c, eptr, len);              GETCHARLEN(c, eptr, len);
3824              prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);              prop_chartype = UCD_CHARTYPE(c);
3825              if ((prop_chartype == prop_value) == prop_fail_result)              if ((prop_chartype == prop_value) == prop_fail_result)
3826                break;                break;
3827              eptr+= len;              eptr+= len;
# Line 3741  for (;;) Line 3834  for (;;)
3834              int len = 1;              int len = 1;
3835              if (eptr >= md->end_subject) break;              if (eptr >= md->end_subject) break;
3836              GETCHARLEN(c, eptr, len);              GETCHARLEN(c, eptr, len);
3837              prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);              prop_script = UCD_SCRIPT(c);
3838              if ((prop_script == prop_value) == prop_fail_result)              if ((prop_script == prop_value) == prop_fail_result)
3839                break;                break;
3840              eptr+= len;              eptr+= len;
# Line 3770  for (;;) Line 3863  for (;;)
3863            {            {
3864            if (eptr >= md->end_subject) break;            if (eptr >= md->end_subject) break;
3865            GETCHARINCTEST(c, eptr);            GETCHARINCTEST(c, eptr);
3866            prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);            prop_category = UCD_CATEGORY(c);
3867            if (prop_category == ucp_M) break;            if (prop_category == ucp_M) break;
3868            while (eptr < md->end_subject)            while (eptr < md->end_subject)
3869              {              {
# Line 3779  for (;;) Line 3872  for (;;)
3872                {                {
3873                GETCHARLEN(c, eptr, len);                GETCHARLEN(c, eptr, len);
3874                }                }
3875              prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);              prop_category = UCD_CATEGORY(c);
3876              if (prop_category != ucp_M) break;              if (prop_category != ucp_M) break;
3877              eptr += len;              eptr += len;
3878              }              }
# Line 3801  for (;;) Line 3894  for (;;)
3894                BACKCHAR(eptr);                BACKCHAR(eptr);
3895                GETCHARLEN(c, eptr, len);                GETCHARLEN(c, eptr, len);
3896                }                }
3897              prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);              prop_category = UCD_CATEGORY(c);
3898              if (prop_category != ucp_M) break;              if (prop_category != ucp_M) break;
3899              eptr--;              eptr--;
3900              }              }
# Line 3821  for (;;) Line 3914  for (;;)
3914            case OP_ANY:            case OP_ANY:
3915            if (max < INT_MAX)            if (max < INT_MAX)
3916              {              {
3917              if ((ims & PCRE_DOTALL) == 0)              for (i = min; i < max; i++)
               {  
               for (i = min; i < max; i++)  
                 {  
                 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;  
                 eptr++;  
                 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;  
                 }  
               }  
             else  
3918                {                {
3919                for (i = min; i < max; i++)                if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3920                  {                eptr++;
3921                  if (eptr >= md->end_subject) break;                while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
                 eptr++;  
                 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;  
                 }  
3922                }                }
3923              }              }
3924    
# Line 3845  for (;;) Line 3926  for (;;)
3926    
3927            else            else
3928              {              {
3929              if ((ims & PCRE_DOTALL) == 0)              for (i = min; i < max; i++)
3930                {                {
3931                for (i = min; i < max; i++)                if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3932                  {                eptr++;
3933                  if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;                while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
                 eptr++;  
                 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;  
                 }  
3934                }                }
3935              else              }
3936              break;
3937    
3938              case OP_ALLANY:
3939              if (max < INT_MAX)
3940                {
3941                for (i = min; i < max; i++)
3942                {                {
3943                eptr = md->end_subject;                if (eptr >= md->end_subject) break;
3944                  eptr++;
3945                  while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3946                }                }
3947              }              }
3948              else eptr = md->end_subject;   /* Unlimited UTF-8 repeat */
3949            break;            break;
3950    
3951            /* The byte case is the same as non-UTF8 */            /* The byte case is the same as non-UTF8 */
# Line 3883  for (;;) Line 3970  for (;;)
3970                }                }
3971              else              else
3972                {                {
3973                if (c != 0x000a && c != 0x000b && c != 0x000c &&                if (c != 0x000a &&
3974                    c != 0x0085 && c != 0x2028 && c != 0x2029)                    (md->bsr_anycrlf ||
3975                       (c != 0x000b && c != 0x000c &&
3976                        c != 0x0085 && c != 0x2028 && c != 0x2029)))
3977                  break;                  break;
3978                eptr += len;                eptr += len;
3979                }                }
# Line 4044  for (;;) Line 4133  for (;;)
4133          switch(ctype)          switch(ctype)
4134            {            {
4135            case OP_ANY:            case OP_ANY:
4136            if ((ims & PCRE_DOTALL) == 0)            for (i = min; i < max; i++)
4137              {              {
4138              for (i = min; i < max; i++)              if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
4139                {              eptr++;
               if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;  
               eptr++;  
               }  
             break;  
4140              }              }
4141            /* For DOTALL case, fall through and treat as \C */            break;
4142    
4143              case OP_ALLANY:
4144            case OP_ANYBYTE:            case OP_ANYBYTE:
4145            c = max - min;            c = max - min;
4146            if (c > (unsigned int)(md->end_subject - eptr))            if (c > (unsigned int)(md->end_subject - eptr))
# Line 4074  for (;;) Line 4160  for (;;)
4160                }                }
4161              else              else
4162                {                {
4163                if (c != 0x000a && c != 0x000b && c != 0x000c && c != 0x0085)                if (c != 0x000a &&
4164                      (md->bsr_anycrlf ||
4165                        (c != 0x000b && c != 0x000c && c != 0x0085)))
4166                  break;                  break;
4167                eptr++;                eptr++;
4168                }                }
# Line 4224  HEAP_RETURN: Line 4312  HEAP_RETURN:
4312  switch (frame->Xwhere)  switch (frame->Xwhere)
4313    {    {
4314    LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)    LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
4315    LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(16)    LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
4316    LBL(17) LBL(18) LBL(19) LBL(20) LBL(21) LBL(22) LBL(23) LBL(24)    LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
4317    LBL(25) LBL(26) LBL(27) LBL(28) LBL(29) LBL(30) LBL(31) LBL(32)    LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
4318    LBL(33) LBL(34) LBL(35) LBL(36) LBL(37) LBL(38) LBL(39) LBL(40)    LBL(53) LBL(54)
4319    LBL(41) LBL(42) LBL(43) LBL(44) LBL(45) LBL(46) LBL(47) LBL(48)  #ifdef SUPPORT_UTF8
4320    LBL(49) LBL(50) LBL(51) LBL(52) LBL(53) LBL(54)    LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
4321      LBL(32) LBL(34) LBL(42) LBL(46)
4322    #ifdef SUPPORT_UCP
4323      LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
4324    #endif  /* SUPPORT_UCP */
4325    #endif  /* SUPPORT_UTF8 */
4326    default:    default:
4327    DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));    DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
4328    return PCRE_ERROR_INTERNAL;    return PCRE_ERROR_INTERNAL;
# Line 4321  Returns: > 0 => success; value Line 4414  Returns: > 0 => success; value
4414                   < -1 => some kind of unexpected problem                   < -1 => some kind of unexpected problem
4415  */  */
4416    
4417  PCRE_EXP_DEFN int  PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
4418  pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,  pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
4419    PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,    PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
4420    int offsetcount)    int offsetcount)
# Line 4408  if (re->magic_number != MAGIC_NUMBER) Line 4501  if (re->magic_number != MAGIC_NUMBER)
4501  /* Set up other data */  /* Set up other data */
4502    
4503  anchored = ((re->options | options) & PCRE_ANCHORED) != 0;  anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
4504  startline = (re->options & PCRE_STARTLINE) != 0;  startline = (re->flags & PCRE_STARTLINE) != 0;
4505  firstline = (re->options & PCRE_FIRSTLINE) != 0;  firstline = (re->options & PCRE_FIRSTLINE) != 0;
4506    
4507  /* The code starts after the real_pcre block and the capture name table. */  /* The code starts after the real_pcre block and the capture name table. */
# Line 4423  end_subject = md->end_subject; Line 4516  end_subject = md->end_subject;
4516    
4517  md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;  md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
4518  utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;  utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
4519    md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
4520    
4521  md->notbol = (options & PCRE_NOTBOL) != 0;  md->notbol = (options & PCRE_NOTBOL) != 0;
4522  md->noteol = (options & PCRE_NOTEOL) != 0;  md->noteol = (options & PCRE_NOTEOL) != 0;
# Line 4435  md->recursive = NULL; Line 4529  md->recursive = NULL;
4529  md->lcc = tables + lcc_offset;  md->lcc = tables + lcc_offset;
4530  md->ctypes = tables + ctypes_offset;  md->ctypes = tables + ctypes_offset;
4531    
4532    /* Handle different \R options. */
4533    
4534    switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
4535      {
4536      case 0:
4537      if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
4538        md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
4539      else
4540    #ifdef BSR_ANYCRLF
4541      md->bsr_anycrlf = TRUE;
4542    #else
4543      md->bsr_anycrlf = FALSE;
4544    #endif
4545      break;
4546    
4547      case PCRE_BSR_ANYCRLF:
4548      md->bsr_anycrlf = TRUE;
4549      break;
4550    
4551      case PCRE_BSR_UNICODE:
4552      md->bsr_anycrlf = FALSE;
4553      break;
4554    
4555      default: return PCRE_ERROR_BADNEWLINE;
4556      }
4557    
4558  /* Handle different types of newline. The three bits give eight cases. If  /* Handle different types of newline. The three bits give eight cases. If
4559  nothing is set at run time, whatever was used at compile time applies. */  nothing is set at run time, whatever was used at compile time applies. */
4560    
4561  switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &  switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
4562         PCRE_NEWLINE_BITS)          (pcre_uint32)options) & PCRE_NEWLINE_BITS)
4563    {    {
4564    case 0: newline = NEWLINE; break;   /* Compile-time default */    case 0: newline = NEWLINE; break;   /* Compile-time default */
4565    case PCRE_NEWLINE_CR: newline = '\r'; break;    case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
4566    case PCRE_NEWLINE_LF: newline = '\n'; break;    case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
4567    case PCRE_NEWLINE_CR+    case PCRE_NEWLINE_CR+
4568         PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;         PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
4569    case PCRE_NEWLINE_ANY: newline = -1; break;    case PCRE_NEWLINE_ANY: newline = -1; break;
4570    case PCRE_NEWLINE_ANYCRLF: newline = -2; break;    case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
4571    default: return PCRE_ERROR_BADNEWLINE;    default: return PCRE_ERROR_BADNEWLINE;
# Line 4478  else Line 4598  else
4598  /* Partial matching is supported only for a restricted set of regexes at the  /* Partial matching is supported only for a restricted set of regexes at the
4599  moment. */  moment. */
4600    
4601  if (md->partial && (re->options & PCRE_NOPARTIAL) != 0)  if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
4602    return PCRE_ERROR_BADPARTIAL;    return PCRE_ERROR_BADPARTIAL;
4603    
4604  /* Check a UTF-8 string if required. Unfortunately there's no way of passing  /* Check a UTF-8 string if required. Unfortunately there's no way of passing
# Line 4555  studied, there may be a bitmap of possib Line 4675  studied, there may be a bitmap of possib
4675    
4676  if (!anchored)  if (!anchored)
4677    {    {
4678    if ((re->options & PCRE_FIRSTSET) != 0)    if ((re->flags & PCRE_FIRSTSET) != 0)
4679      {      {
4680      first_byte = re->first_byte & 255;      first_byte = re->first_byte & 255;
4681      if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)      if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
# Line 4570  if (!anchored) Line 4690  if (!anchored)
4690  /* For anchored or unanchored matches, there may be a "last known required  /* For anchored or unanchored matches, there may be a "last known required
4691  character" set. */  character" set. */
4692    
4693  if ((re->options & PCRE_REQCHSET) != 0)  if ((re->flags & PCRE_REQCHSET) != 0)
4694    {    {
4695    req_byte = re->req_byte & 255;    req_byte = re->req_byte & 255;
4696    req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;    req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
# Line 4597  for(;;) Line 4717  for(;;)
4717      while (iptr < iend) *iptr++ = -1;      while (iptr < iend) *iptr++ = -1;
4718      }      }
4719    
4720    /* Advance to a unique first char if possible. If firstline is TRUE, the    /* If firstline is TRUE, the start of the match is constrained to the first
4721    start of the match is constrained to the first line of a multiline string.    line of a multiline string. That is, the match must be before or at the first
4722    That is, the match must be before or at the first newline. Implement this by    newline. Implement this by temporarily adjusting end_subject so that we stop
4723    temporarily adjusting end_subject so that we stop scanning at a newline. If    scanning at a newline. If the match fails at the newline, later code breaks
4724    the match fails at the newline, later code breaks this loop. */    this loop. */
4725    
4726    if (firstline)    if (firstline)
4727      {      {
4728      USPTR t = start_match;      USPTR t = start_match;
4729    #ifdef SUPPORT_UTF8
4730        if (utf8)
4731          {
4732          while (t < md->end_subject && !IS_NEWLINE(t))
4733            {
4734            t++;
4735            while (t < end_subject && (*t & 0xc0) == 0x80) t++;
4736            }
4737          }
4738        else
4739    #endif
4740      while (t < md->end_subject && !IS_NEWLINE(t)) t++;      while (t < md->end_subject && !IS_NEWLINE(t)) t++;
4741      end_subject = t;      end_subject = t;
4742      }      }
4743    
4744    /* Now test for a unique first byte */    /* There are some optimizations that avoid running the match if a known
4745      starting point is not found, or if a known later character is not present.
4746      However, there is an option that disables these, for testing and for ensuring
4747      that all callouts do actually occur. */
4748    
4749    if (first_byte >= 0)    if ((options & PCRE_NO_START_OPTIMIZE) == 0)
4750      {      {
4751      if (first_byte_caseless)      /* Advance to a unique first byte if there is one. */
4752        while (start_match < end_subject &&  
4753               md->lcc[*start_match] != first_byte)      if (first_byte >= 0)
4754          start_match++;        {
4755      else        if (first_byte_caseless)
4756        while (start_match < end_subject && *start_match != first_byte)          while (start_match < end_subject && md->lcc[*start_match] != first_byte)
4757          start_match++;            start_match++;
4758      }        else
4759            while (start_match < end_subject && *start_match != first_byte)
4760              start_match++;
4761          }
4762    
4763    /* Or to just after a linebreak for a multiline match if possible */      /* Or to just after a linebreak for a multiline match */
4764    
4765    else if (startline)      else if (startline)
     {  
     if (start_match > md->start_subject + start_offset)  
4766        {        {
4767        while (start_match <= end_subject && !WAS_NEWLINE(start_match))        if (start_match > md->start_subject + start_offset)
4768          start_match++;          {
4769    #ifdef SUPPORT_UTF8
4770            if (utf8)
4771              {
4772              while (start_match < end_subject && !WAS_NEWLINE(start_match))
4773                {
4774                start_match++;
4775                while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
4776                  start_match++;
4777                }
4778              }
4779            else
4780    #endif
4781            while (start_match < end_subject && !WAS_NEWLINE(start_match))
4782              start_match++;
4783    
4784            /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
4785            and we are now at a LF, advance the match position by one more character.
4786            */
4787    
4788        /* If we have just passed a CR and the newline option is ANY or ANYCRLF,          if (start_match[-1] == CHAR_CR &&
4789        and we are now at a LF, advance the match position by one more character.               (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
4790        */               start_match < end_subject &&
4791                 *start_match == CHAR_NL)
4792        if (start_match[-1] == '\r' &&            start_match++;
4793             (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&          }
            start_match < end_subject &&  
            *start_match == '\n')  
         start_match++;  
4794        }        }
     }  
4795    
4796    /* Or to a non-unique first char after study */      /* Or to a non-unique first byte after study */
4797    
4798    else if (start_bits != NULL)      else if (start_bits != NULL)
     {  
     while (start_match < end_subject)  
4799        {        {
4800        register unsigned int c = *start_match;        while (start_match < end_subject)
4801        if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;          {
4802            register unsigned int c = *start_match;
4803            if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++;
4804              else break;
4805            }
4806        }        }
4807      }      }   /* Starting optimizations */
4808    
4809    /* Restore fudged end_subject */    /* Restore fudged end_subject */
4810    
# Line 4665  for(;;) Line 4816  for(;;)
4816    printf("\n");    printf("\n");
4817  #endif  #endif
4818    
4819    /* If req_byte is set, we know that that character must appear in the subject    /* If req_byte is set, we know that that character must appear in the
4820    for the match to succeed. If the first character is set, req_byte must be    subject for the match to succeed. If the first character is set, req_byte
4821    later in the subject; otherwise the test starts at the match point. This    must be later in the subject; otherwise the test starts at the match point.
4822    optimization can save a huge amount of backtracking in patterns with nested    This optimization can save a huge amount of backtracking in patterns with
4823    unlimited repeats that aren't going to match. Writing separate code for    nested unlimited repeats that aren't going to match. Writing separate code
4824    cased/caseless versions makes it go faster, as does using an autoincrement    for cased/caseless versions makes it go faster, as does using an
4825    and backing off on a match.    autoincrement and backing off on a match.
4826    
4827    HOWEVER: when the subject string is very, very long, searching to its end can    HOWEVER: when the subject string is very, very long, searching to its end
4828    take a long time, and give bad performance on quite ordinary patterns. This    can take a long time, and give bad performance on quite ordinary patterns.
4829    showed up when somebody was matching something like /^\d+C/ on a 32-megabyte    This showed up when somebody was matching something like /^\d+C/ on a
4830    string... so we don't do this when the string is sufficiently long.    32-megabyte string... so we don't do this when the string is sufficiently
4831      long.
4832    
4833    ALSO: this processing is disabled when partial matching is requested.    ALSO: this processing is disabled when partial matching is requested, or if
4834    */    disabling is explicitly requested. */
4835    
4836    if (req_byte >= 0 &&    if ((options & PCRE_NO_START_OPTIMIZE) == 0 &&
4837          req_byte >= 0 &&
4838        end_subject - start_match < REQ_BYTE_MAX &&        end_subject - start_match < REQ_BYTE_MAX &&
4839        !md->partial)        !md->partial)
4840      {      {
# Line 4789  for(;;) Line 4942  for(;;)
4942    not contain any explicit matches for \r or \n, and the newline option is CRLF    not contain any explicit matches for \r or \n, and the newline option is CRLF
4943    or ANY or ANYCRLF, advance the match position by one more character. */    or ANY or ANYCRLF, advance the match position by one more character. */
4944    
4945    if (start_match[-1] == '\r' &&    if (start_match[-1] == CHAR_CR &&
4946        start_match < end_subject &&        start_match < end_subject &&
4947        *start_match == '\n' &&        *start_match == CHAR_NL &&
4948        (re->options & PCRE_HASCRORLF) == 0 &&        (re->flags & PCRE_HASCRORLF) == 0 &&
4949          (md->nltype == NLTYPE_ANY ||          (md->nltype == NLTYPE_ANY ||
4950           md->nltype == NLTYPE_ANYCRLF ||           md->nltype == NLTYPE_ANYCRLF ||
4951           md->nllen == 2))           md->nllen == 2))

Legend:
Removed from v.227  
changed lines
  Added in v.395

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12