/[pcre]/code/tags/pcre-4.1/pcre.c
ViewVC logotype

Diff of /code/tags/pcre-4.1/pcre.c

Parent Directory | Revision Log | View Patch

revision 63 by nigel, Sat Feb 24 21:40:03 2007 UTC revision 65 by nigel, Sat Feb 24 21:40:08 2007 UTC
# Line 68  compile time. */ Line 68  compile time. */
68  #define BRASTACK_SIZE 200  #define BRASTACK_SIZE 200
69    
70    
   
71  /* Maximum number of ints of offset to save on the stack for recursive calls.  /* Maximum number of ints of offset to save on the stack for recursive calls.
72  If the offset vector is bigger, malloc is used. This should be a multiple of 3,  If the offset vector is bigger, malloc is used. This should be a multiple of 3,
73  because the offset vector is always a multiple of 3 long. */  because the offset vector is always a multiple of 3 long. */
# Line 84  test output would be different, which ju Line 83  test output would be different, which ju
83  #define MAXLIT 250  #define MAXLIT 250
84    
85    
86    /* The maximum remaining length of subject we are prepared to search for a
87    req_byte match. */
88    
89    #define REQ_BYTE_MAX 1000
90    
91    
92  /* Table of sizes for the fixed-length opcodes. It's defined in a macro so that  /* Table of sizes for the fixed-length opcodes. It's defined in a macro so that
93  the definition is next to the definition of the opcodes in internal.h. */  the definition is next to the definition of the opcodes in internal.h. */
94    
# Line 1138  Returns: pointer to the opcode for Line 1143  Returns: pointer to the opcode for
1143  static const uschar *  static const uschar *
1144  find_bracket(const uschar *code, BOOL utf8, int number)  find_bracket(const uschar *code, BOOL utf8, int number)
1145  {  {
1146    #ifndef SUPPORT_UTF8
1147    utf8 = utf8;               /* Stop pedantic compilers complaining */
1148    #endif
1149    
1150  for (;;)  for (;;)
1151    {    {
1152    register int c = *code;    register int c = *code;
# Line 1453  int length; Line 1462  int length;
1462  int greedy_default, greedy_non_default;  int greedy_default, greedy_non_default;
1463  int firstbyte, reqbyte;  int firstbyte, reqbyte;
1464  int zeroreqbyte, zerofirstbyte;  int zeroreqbyte, zerofirstbyte;
1465  int req_caseopt;  int req_caseopt, reqvary, tempreqvary;
1466  int condcount = 0;  int condcount = 0;
1467  int options = *optionsptr;  int options = *optionsptr;
1468  register int c;  register int c;
# Line 1699  for (;; ptr++) Line 1708  for (;; ptr++)
1708          posix_class *= 3;          posix_class *= 3;
1709          for (i = 0; i < 3; i++)          for (i = 0; i < 3; i++)
1710            {            {
1711            BOOL isblank = strncmp(ptr, "blank", 5) == 0;            BOOL isblank = strncmp((char *)ptr, "blank", 5) == 0;
1712            int taboffset = posix_class_maps[posix_class + i];            int taboffset = posix_class_maps[posix_class + i];
1713            if (taboffset < 0) break;            if (taboffset < 0) break;
1714            if (local_negate)            if (local_negate)
# Line 1949  for (;; ptr++) Line 1958  for (;; ptr++)
1958          else          else
1959            {            {
1960            zerofirstbyte = firstbyte;            zerofirstbyte = firstbyte;
1961            reqbyte = class_lastchar | req_caseopt;            reqbyte = class_lastchar | req_caseopt | cd->req_varyopt;
1962            }            }
1963          *code++ = OP_CHARS;          *code++ = OP_CHARS;
1964          *code++ = 1;          *code++ = 1;
# Line 2053  for (;; ptr++) Line 2062  for (;; ptr++)
2062    
2063      if (repeat_min == 0)      if (repeat_min == 0)
2064        {        {
2065        firstbyte = zerofirstbyte;   /* Adjust for zero repeat */        firstbyte = zerofirstbyte;    /* Adjust for zero repeat */
2066        reqbyte = zeroreqbyte;       /* Ditto */        reqbyte = zeroreqbyte;        /* Ditto */
2067        }        }
2068    
2069        /* Remember whether this is a variable length repeat */
2070    
2071        reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
2072    
2073      op_type = 0;                    /* Default single-char op codes */      op_type = 0;                    /* Default single-char op codes */
2074      possessive_quantifier = FALSE;  /* Default not possessive quantifier */      possessive_quantifier = FALSE;  /* Default not possessive quantifier */
2075    
# Line 2142  for (;; ptr++) Line 2155  for (;; ptr++)
2155          if (code == previous + 2)   /* There was only one character */          if (code == previous + 2)   /* There was only one character */
2156            {            {
2157            code = previous;              /* Abolish the previous item */            code = previous;              /* Abolish the previous item */
2158            if (repeat_min > 1) reqbyte = c | req_caseopt;            if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
2159            }            }
2160          else          else
2161            {            {
# Line 2501  for (;; ptr++) Line 2514  for (;; ptr++)
2514        PUT(tempcode, 1, len);        PUT(tempcode, 1, len);
2515        }        }
2516    
2517      /* In all cases we no longer have a previous item. */      /* In all cases we no longer have a previous item. We also set the
2518        "follows varying string" flag for subsequently encountered reqbytes if
2519        it isn't already set and we have just passed a varying length item. */
2520    
2521      END_REPEAT:      END_REPEAT:
2522      previous = NULL;      previous = NULL;
2523        cd->req_varyopt |= reqvary;
2524      break;      break;
2525    
2526    
# Line 2553  for (;; ptr++) Line 2569  for (;; ptr++)
2569    
2570          else if ((cd->ctypes[ptr[1]] & ctype_digit) != 0)          else if ((cd->ctypes[ptr[1]] & ctype_digit) != 0)
2571            {            {
2572            int condref = *(++ptr) - '0';            int condref;                 /* Don't amalgamate; some compilers */
2573              condref = *(++ptr) - '0';    /* grumble at autoincrement in declaration */
2574            while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';            while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
2575            if (condref == 0)            if (condref == 0)
2576              {              {
# Line 2619  for (;; ptr++) Line 2636  for (;; ptr++)
2636          if (*(++ptr) == '<')      /* Definition */          if (*(++ptr) == '<')      /* Definition */
2637            {            {
2638            int i, namelen;            int i, namelen;
           const uschar *name = ++ptr;  
2639            uschar *slot = cd->name_table;            uschar *slot = cd->name_table;
2640              const uschar *name;     /* Don't amalgamate; some compilers */
2641              name = ++ptr;           /* grumble at autoincrement in declaration */
2642    
2643            while (*ptr++ != '>');            while (*ptr++ != '>');
2644            namelen = ptr - name - 1;            namelen = ptr - name - 1;
2645    
2646            for (i = 0; i < cd->names_found; i++)            for (i = 0; i < cd->names_found; i++)
2647              {              {
2648              int c = memcmp(name, slot+2, namelen + 1);              int c = memcmp(name, slot+2, namelen);
2649              if (c == 0)              if (c == 0)
2650                {                {
2651                *errorptr = ERR43;                if (slot[2+namelen] == 0)
2652                goto FAILED;                  {
2653                    *errorptr = ERR43;
2654                    goto FAILED;
2655                    }
2656                  c = -1;             /* Current name is substring */
2657                }                }
2658              if (c < 0)              if (c < 0)
2659                {                {
# Line 2661  for (;; ptr++) Line 2683  for (;; ptr++)
2683    
2684            for (i = 0; i < cd->names_found; i++)            for (i = 0; i < cd->names_found; i++)
2685              {              {
2686              if (strncmp(name, slot+2, namelen) == 0) break;              if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
2687              slot += cd->name_entry_size;              slot += cd->name_entry_size;
2688              }              }
2689            if (i >= cd->names_found)            if (i >= cd->names_found)
# Line 2839  for (;; ptr++) Line 2861  for (;; ptr++)
2861      previous = (bravalue >= OP_ONCE)? code : NULL;      previous = (bravalue >= OP_ONCE)? code : NULL;
2862      *code = bravalue;      *code = bravalue;
2863      tempcode = code;      tempcode = code;
2864        tempreqvary = cd->req_varyopt;     /* Save value before bracket */
2865    
2866      if (!compile_regex(      if (!compile_regex(
2867           newoptions,                   /* The complete new option state */           newoptions,                   /* The complete new option state */
# Line 2917  for (;; ptr++) Line 2940  for (;; ptr++)
2940          }          }
2941    
2942        /* If firstbyte was previously set, convert the subpattern's firstbyte        /* If firstbyte was previously set, convert the subpattern's firstbyte
2943        into reqbyte if there wasn't one. */        into reqbyte if there wasn't one, using the vary flag that was in
2944          existence beforehand. */
2945    
2946        else if (subfirstbyte >= 0 && subreqbyte < 0) subreqbyte = subfirstbyte;        else if (subfirstbyte >= 0 && subreqbyte < 0)
2947            subreqbyte = subfirstbyte | tempreqvary;
2948    
2949        /* If the subpattern set a required char (or set a first char that isn't        /* If the subpattern set a required byte (or set a first byte that isn't
2950        really the first char - see above), set it. */        really the first byte - see above), set it. */
2951    
2952        if (subreqbyte >= 0) reqbyte = subreqbyte;        if (subreqbyte >= 0) reqbyte = subreqbyte;
2953        }        }
# Line 3140  for (;; ptr++) Line 3165  for (;; ptr++)
3165          if (firstbyte == REQ_UNSET)          if (firstbyte == REQ_UNSET)
3166            {            {
3167            zerofirstbyte = firstbyte = previous[2] | req_caseopt;            zerofirstbyte = firstbyte = previous[2] | req_caseopt;
3168            zeroreqbyte = (t - 1 == previous + 2)? reqbyte : t[-1] | req_caseopt;            zeroreqbyte = (t - 1 == previous + 2)?
3169                reqbyte : t[-1] | req_caseopt | cd->req_varyopt;
3170            }            }
3171    
3172          /* If there was a previous first byte, leave it alone, and don't change          /* If there was a previous first byte, leave it alone, and don't change
# Line 3150  for (;; ptr++) Line 3176  for (;; ptr++)
3176          else          else
3177            {            {
3178            zerofirstbyte = firstbyte;            zerofirstbyte = firstbyte;
3179            zeroreqbyte = t[-1] | req_caseopt;            zeroreqbyte = t[-1] | req_caseopt | cd->req_varyopt;
3180            }            }
3181          }          }
3182    
3183        /* In all cases (we know length > 1), the new required byte is the last        /* In all cases (we know length > 1), the new required byte is the last
3184        byte of the string. */        byte of the string. */
3185    
3186        reqbyte = code[-1] | req_caseopt;        reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
3187        }        }
3188    
3189      else   /* End of UTF-8 coding */      else   /* End of UTF-8 coding */
# Line 3180  for (;; ptr++) Line 3206  for (;; ptr++)
3206          else          else
3207            {            {
3208            zerofirstbyte = firstbyte = previous[2] | req_caseopt;            zerofirstbyte = firstbyte = previous[2] | req_caseopt;
3209            zeroreqbyte = (length > 2)? (code[-2] | req_caseopt) : reqbyte;            zeroreqbyte = (length > 2)?
3210            reqbyte = code[-1] | req_caseopt;              (code[-2] | req_caseopt | cd->req_varyopt) : reqbyte;
3211              reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
3212            }            }
3213          }          }
3214    
# Line 3190  for (;; ptr++) Line 3217  for (;; ptr++)
3217        else        else
3218          {          {
3219          zerofirstbyte = firstbyte;          zerofirstbyte = firstbyte;
3220          zeroreqbyte = (length == 1)? reqbyte : code[-2] | req_caseopt;          zeroreqbyte = (length == 1)? reqbyte :
3221          reqbyte = code[-1] | req_caseopt;            code[-2] | req_caseopt | cd->req_varyopt;
3222            reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
3223          }          }
3224        }        }
3225    
# Line 3308  for (;;) Line 3336  for (;;)
3336      }      }
3337    
3338    /* If this is not the first branch, the first char and reqbyte have to    /* If this is not the first branch, the first char and reqbyte have to
3339    match the values from all the previous branches. */    match the values from all the previous branches, except that if the previous
3340      value for reqbyte didn't have REQ_VARY set, it can still match, and we set
3341      REQ_VARY for the regex. */
3342    
3343    else    else
3344      {      {
# Line 3330  for (;;) Line 3360  for (;;)
3360    
3361      /* Now ensure that the reqbytes match */      /* Now ensure that the reqbytes match */
3362    
3363      if (reqbyte != branchreqbyte) reqbyte = REQ_NONE;      if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
3364          reqbyte = REQ_NONE;
3365        else reqbyte |= branchreqbyte;   /* To "or" REQ_VARY */
3366      }      }
3367    
3368    /* If lookbehind, check that this branch matches a fixed-length string,    /* If lookbehind, check that this branch matches a fixed-length string,
# Line 4168  while ((c = *(++ptr)) != 0) Line 4200  while ((c = *(++ptr)) != 0)
4200          ptr += 3;          ptr += 3;
4201          if (*ptr == '<')          if (*ptr == '<')
4202            {            {
4203            const uschar *p = ++ptr;            const uschar *p;    /* Don't amalgamate; some compilers */
4204              p = ++ptr;          /* grumble at autoincrement in declaration */
4205            while ((compile_block.ctypes[*ptr] & ctype_word) != 0) ptr++;            while ((compile_block.ctypes[*ptr] & ctype_word) != 0) ptr++;
4206            if (*ptr != '>')            if (*ptr != '>')
4207              {              {
# Line 4599  compile_block.name_entry_size = max_name Line 4632  compile_block.name_entry_size = max_name
4632  compile_block.name_table = (uschar *)re + sizeof(real_pcre);  compile_block.name_table = (uschar *)re + sizeof(real_pcre);
4633  codestart = compile_block.name_table + re->name_entry_size * re->name_count;  codestart = compile_block.name_table + re->name_entry_size * re->name_count;
4634  compile_block.start_code = codestart;  compile_block.start_code = codestart;
4635    compile_block.req_varyopt = 0;
4636    
4637  /* Set up a starting, non-extracting bracket, then compile the expression. On  /* Set up a starting, non-extracting bracket, then compile the expression. On
4638  error, *errorptr will be set non-NULL, so we don't need to look at the result  error, *errorptr will be set non-NULL, so we don't need to look at the result
# Line 4672  if ((options & PCRE_ANCHORED) == 0) Line 4706  if ((options & PCRE_ANCHORED) == 0)
4706      }      }
4707    }    }
4708    
4709  /* Save the last required character if any. Remove caseless flag for  /* For an anchored pattern, we use the "required byte" only if it follows a
4710  non-caseable chars. */  variable length item in the regex. Remove the caseless flag for non-caseable
4711    chars. */
4712    
4713  if ((re->options & PCRE_ANCHORED) != 0 && reqbyte < 0 && firstbyte >= 0)  if (reqbyte >= 0 &&
4714    reqbyte = firstbyte;       ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
   
 if (reqbyte >= 0)  
4715    {    {
4716    int ch = reqbyte & 255;    int ch = reqbyte & 255;
4717    re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&    re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
# Line 5263  for (;;) Line 5296  for (;;)
5296          (pcre_free)(new_recursive.offset_save);          (pcre_free)(new_recursive.offset_save);
5297        return MATCH_NOMATCH;        return MATCH_NOMATCH;
5298        }        }
5299      break;      /* Control never reaches here */
5300    
5301      /* "Once" brackets are like assertion brackets except that after a match,      /* "Once" brackets are like assertion brackets except that after a match,
5302      the point in the subject string is not moved back. Thus there can never be      the point in the subject string is not moved back. Thus there can never be
# Line 7370  do Line 7403  do
7403    optimization can save a huge amount of backtracking in patterns with nested    optimization can save a huge amount of backtracking in patterns with nested
7404    unlimited repeats that aren't going to match. Writing separate code for    unlimited repeats that aren't going to match. Writing separate code for
7405    cased/caseless versions makes it go faster, as does using an autoincrement    cased/caseless versions makes it go faster, as does using an autoincrement
7406    and backing off on a match. */    and backing off on a match.
7407    
7408      HOWEVER: when the subject string is very, very long, searching to its end can
7409      take a long time, and give bad performance on quite ordinary patterns. This
7410      showed up when somebody was matching /^C/ on a 32-megabyte string... so we
7411      don't do this when the string is sufficiently long. */
7412    
7413    if (req_byte >= 0)    if (req_byte >= 0 && end_subject - start_match < REQ_BYTE_MAX)
7414      {      {
7415      register const uschar *p = start_match + ((first_byte >= 0)? 1 : 0);      register const uschar *p = start_match + ((first_byte >= 0)? 1 : 0);
7416    

Legend:
Removed from v.63  
changed lines
  Added in v.65

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12