/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 604 by ph10, Thu Jun 2 19:04:54 2011 UTC revision 635 by ph10, Sat Jul 23 16:19:50 2011 UTC
# Line 409  static const char error_texts[] = Line 409  static const char error_texts[] =
409    "(*MARK) must have an argument\0"    "(*MARK) must have an argument\0"
410    "this version of PCRE is not compiled with PCRE_UCP support\0"    "this version of PCRE is not compiled with PCRE_UCP support\0"
411    "\\c must be followed by an ASCII character\0"    "\\c must be followed by an ASCII character\0"
412      "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
413    ;    ;
414    
415  /* Table to identify digits and hex digits. This is used when compiling  /* Table to identify digits and hex digits. This is used when compiling
# Line 1694  _pcre_find_bracket(const uschar *code, B Line 1695  _pcre_find_bracket(const uschar *code, B
1695  for (;;)  for (;;)
1696    {    {
1697    register int c = *code;    register int c = *code;
1698    
1699    if (c == OP_END) return NULL;    if (c == OP_END) return NULL;
1700    
1701    /* XCLASS is used for classes that cannot be represented just by a bit    /* XCLASS is used for classes that cannot be represented just by a bit
# Line 1974  for (code = first_significant_code(code Line 1976  for (code = first_significant_code(code
1976      }      }
1977    
1978    /* For a recursion/subroutine call, if its end has been reached, which    /* For a recursion/subroutine call, if its end has been reached, which
1979    implies a subroutine call, we can scan it. */    implies a backward reference subroutine call, we can scan it. If it's a
1980      forward reference subroutine call, we can't. To detect forward reference
1981      we have to scan up the list that is kept in the workspace. This function is
1982      called only when doing the real compile, not during the pre-compile that
1983      measures the size of the compiled pattern. */
1984    
1985    if (c == OP_RECURSE)    if (c == OP_RECURSE)
1986      {      {
1987      BOOL empty_branch = FALSE;      const uschar *scode;
1988      const uschar *scode = cd->start_code + GET(code, 1);      BOOL empty_branch;
1989    
1990        /* Test for forward reference */
1991    
1992        for (scode = cd->start_workspace; scode < cd->hwm; scode += LINK_SIZE)
1993          if (GET(scode, 0) == code + 1 - cd->start_code) return TRUE;
1994    
1995        /* Not a forward reference, test for completed backward reference */
1996    
1997        empty_branch = FALSE;
1998        scode = cd->start_code + GET(code, 1);
1999      if (GET(scode, 1) == 0) return TRUE;    /* Unclosed */      if (GET(scode, 1) == 0) return TRUE;    /* Unclosed */
2000    
2001        /* Completed backwards reference */
2002    
2003      do      do
2004        {        {
2005        if (could_be_empty_branch(scode, endcode, utf8, cd))        if (could_be_empty_branch(scode, endcode, utf8, cd))
# Line 1991  for (code = first_significant_code(code Line 2010  for (code = first_significant_code(code
2010        scode += GET(scode, 1);        scode += GET(scode, 1);
2011        }        }
2012      while (*scode == OP_ALT);      while (*scode == OP_ALT);
2013    
2014      if (!empty_branch) return FALSE;  /* All branches are non-empty */      if (!empty_branch) return FALSE;  /* All branches are non-empty */
2015      continue;      continue;
2016      }      }
# Line 2216  return TRUE; Line 2236  return TRUE;
2236  the current branch of the current pattern to see if it could match the empty  the current branch of the current pattern to see if it could match the empty
2237  string. If it could, we must look outwards for branches at other levels,  string. If it could, we must look outwards for branches at other levels,
2238  stopping when we pass beyond the bracket which is the subject of the recursion.  stopping when we pass beyond the bracket which is the subject of the recursion.
2239    This function is called only during the real compile, not during the
2240    pre-compile.
2241    
2242  Arguments:  Arguments:
2243    code        points to start of the recursion    code        points to start of the recursion
# Line 3017  int greedy_default, greedy_non_default; Line 3039  int greedy_default, greedy_non_default;
3039  int firstbyte, reqbyte;  int firstbyte, reqbyte;
3040  int zeroreqbyte, zerofirstbyte;  int zeroreqbyte, zerofirstbyte;
3041  int req_caseopt, reqvary, tempreqvary;  int req_caseopt, reqvary, tempreqvary;
3042  int options = *optionsptr;  int options = *optionsptr;               /* May change dynamically */
3043  int after_manual_callout = 0;  int after_manual_callout = 0;
3044  int length_prevgroup = 0;  int length_prevgroup = 0;
3045  register int c;  register int c;
# Line 3035  uschar *previous_callout = NULL; Line 3057  uschar *previous_callout = NULL;
3057  uschar *save_hwm = NULL;  uschar *save_hwm = NULL;
3058  uschar classbits[32];  uschar classbits[32];
3059    
3060    /* We can fish out the UTF-8 setting once and for all into a BOOL, but we
3061    must not do this for other options (e.g. PCRE_EXTENDED) because they may change
3062    dynamically as we process the pattern. */
3063    
3064  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
3065  BOOL class_utf8;  BOOL class_utf8;
3066  BOOL utf8 = (options & PCRE_UTF8) != 0;  BOOL utf8 = (options & PCRE_UTF8) != 0;
# Line 3215  for (;; ptr++) Line 3241  for (;; ptr++)
3241      previous_callout = NULL;      previous_callout = NULL;
3242      }      }
3243    
3244    /* In extended mode, skip white space and comments */    /* In extended mode, skip white space and comments. */
3245    
3246    if ((options & PCRE_EXTENDED) != 0)    if ((options & PCRE_EXTENDED) != 0)
3247      {      {
# Line 4207  for (;; ptr++) Line 4233  for (;; ptr++)
4233        ptr++;        ptr++;
4234        }        }
4235      else repeat_type = greedy_default;      else repeat_type = greedy_default;
4236    
4237        /* If previous was a recursion call, wrap it in atomic brackets so that
4238        previous becomes the atomic group. All recursions were so wrapped in the
4239        past, but it no longer happens for non-repeated recursions. In fact, the
4240        repeated ones could be re-implemented independently so as not to need this,
4241        but for the moment we rely on the code for repeating groups. */
4242    
4243        if (*previous == OP_RECURSE)
4244          {
4245          memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);
4246          *previous = OP_ONCE;
4247          PUT(previous, 1, 2 + 2*LINK_SIZE);
4248          previous[2 + 2*LINK_SIZE] = OP_KET;
4249          PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE);
4250          code += 2 + 2 * LINK_SIZE;
4251          length_prevgroup = 3 + 3*LINK_SIZE;
4252    
4253          /* When actually compiling, we need to check whether this was a forward
4254          reference, and if so, adjust the offset. */
4255    
4256          if (lengthptr == NULL && cd->hwm >= cd->start_workspace + LINK_SIZE)
4257            {
4258            int offset = GET(cd->hwm, -LINK_SIZE);
4259            if (offset == previous + 1 - cd->start_code)
4260              PUT(cd->hwm, -LINK_SIZE, offset + 1 + LINK_SIZE);
4261            }
4262          }
4263    
4264        /* Now handle repetition for the different types of item. */
4265    
4266      /* If previous was a character match, abolish the item and generate a      /* If previous was a character match, abolish the item and generate a
4267      repeat item instead. If a char item has a minimum of more than one, ensure      repeat item instead. If a char item has a minimum of more than one, ensure
# Line 4510  for (;; ptr++) Line 4565  for (;; ptr++)
4565        int len = (int)(code - previous);        int len = (int)(code - previous);
4566        uschar *bralink = NULL;        uschar *bralink = NULL;
4567        uschar *brazeroptr = NULL;        uschar *brazeroptr = NULL;
4568    
4569        /* Repeating a DEFINE group is pointless */        /* Repeating a DEFINE group is pointless */
4570    
4571        if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)        if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
# Line 4726  for (;; ptr++) Line 4781  for (;; ptr++)
4781          }          }
4782    
4783        /* If the maximum is unlimited, set a repeater in the final copy. For        /* If the maximum is unlimited, set a repeater in the final copy. For
4784        ONCE brackets, that's all we need to do.        ONCE brackets, that's all we need to do. However, possessively repeated
4785          ONCE brackets can be converted into non-capturing brackets, as the
4786          behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to
4787          deal with possessive ONCEs specially.
4788    
4789        Otherwise, if the quantifier was possessive, we convert the BRA code to        Otherwise, if the quantifier was possessive, we convert the BRA code to
4790        the POS form, and the KET code to KETRPOS. (It turns out to be convenient        the POS form, and the KET code to KETRPOS. (It turns out to be convenient
4791        at runtime to detect this kind of subpattern at both the start and at the        at runtime to detect this kind of subpattern at both the start and at the
4792        end.) If the group is preceded by OP_BRAZERO, convert this to        end.) The use of special opcodes makes it possible to reduce greatly the
4793        OP_BRAPOSZERO. Then cancel the possessive flag so that the default action        stack usage in pcre_exec(). If the group is preceded by OP_BRAZERO,
4794        below, of wrapping everything inside atomic brackets, does not happen.        convert this to OP_BRAPOSZERO. Then cancel the possessive flag so that
4795          the default action below, of wrapping everything inside atomic brackets,
4796          does not happen.
4797    
4798        Then, when we are doing the actual compile phase, check to see whether        Then, when we are doing the actual compile phase, check to see whether
4799        this group is one that could match an empty string. If so, convert the        this group is one that could match an empty string. If so, convert the
# Line 4745  for (;; ptr++) Line 4805  for (;; ptr++)
4805          {          {
4806          uschar *ketcode = code - 1 - LINK_SIZE;          uschar *ketcode = code - 1 - LINK_SIZE;
4807          uschar *bracode = ketcode - GET(ketcode, 1);          uschar *bracode = ketcode - GET(ketcode, 1);
4808    
4809          if (*bracode == OP_ONCE)          if (*bracode == OP_ONCE && possessive_quantifier) *bracode = OP_BRA;
4810            if (*bracode == OP_ONCE)
4811            *ketcode = OP_KETRMAX + repeat_type;            *ketcode = OP_KETRMAX + repeat_type;
4812          else          else
4813            {            {
# Line 4793  for (;; ptr++) Line 4854  for (;; ptr++)
4854        }        }
4855    
4856      /* If the character following a repeat is '+', or if certain optimization      /* If the character following a repeat is '+', or if certain optimization
4857      tests above succeeded, possessive_quantifier is TRUE. For some of the      tests above succeeded, possessive_quantifier is TRUE. For some opcodes,
4858      simpler opcodes, there is an special alternative opcode for this. For      there are special alternative opcodes for this case. For anything else, we
4859      anything else, we wrap the entire repeated item inside OP_ONCE brackets.      wrap the entire repeated item inside OP_ONCE brackets. Logically, the '+'
4860      The '+' notation is just syntactic sugar, taken from Sun's Java package,      notation is just syntactic sugar, taken from Sun's Java package, but the
4861      but the special opcodes can optimize it a bit. The repeated item starts at      special opcodes can optimize it.
4862      tempcode, not at previous, which might be the first part of a string whose  
4863      (former) last char we repeated.      Possessively repeated subpatterns have already been handled in the code
4864        just above, so possessive_quantifier is always FALSE for them at this
4865        stage.
4866    
4867        Note that the repeated item starts at tempcode, not at previous, which
4868        might be the first part of a string whose (former) last char we repeated.
4869    
4870      Possessifying an 'exact' quantifier has no effect, so we can ignore it. But      Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
4871      an 'upto' may follow. We skip over an 'exact' item, and then test the      an 'upto' may follow. We skip over an 'exact' item, and then test the
# Line 4924  for (;; ptr++) Line 4990  for (;; ptr++)
4990          if (namelen == verbs[i].len &&          if (namelen == verbs[i].len &&
4991              strncmp((char *)name, vn, namelen) == 0)              strncmp((char *)name, vn, namelen) == 0)
4992            {            {
4993            /* Check for open captures before ACCEPT */            /* Check for open captures before ACCEPT and convert it to
4994              ASSERT_ACCEPT if in an assertion. */
4995    
4996            if (verbs[i].op == OP_ACCEPT)            if (verbs[i].op == OP_ACCEPT)
4997              {              {
4998              open_capitem *oc;              open_capitem *oc;
4999                if (arglen != 0)
5000                  {
5001                  *errorcodeptr = ERR59;
5002                  goto FAILED;
5003                  }
5004              cd->had_accept = TRUE;              cd->had_accept = TRUE;
5005              for (oc = cd->open_caps; oc != NULL; oc = oc->next)              for (oc = cd->open_caps; oc != NULL; oc = oc->next)
5006                {                {
5007                *code++ = OP_CLOSE;                *code++ = OP_CLOSE;
5008                PUT2INC(code, 0, oc->number);                PUT2INC(code, 0, oc->number);
5009                }                }
5010                *code++ = (cd->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
5011              }              }
5012    
5013            /* Handle the cases with/without an argument */            /* Handle other cases with/without an argument */
5014    
5015            if (arglen == 0)            else if (arglen == 0)
5016              {              {
5017              if (verbs[i].op < 0)   /* Argument is mandatory */              if (verbs[i].op < 0)   /* Argument is mandatory */
5018                {                {
# Line 5228  for (;; ptr++) Line 5301  for (;; ptr++)
5301          /* ------------------------------------------------------------ */          /* ------------------------------------------------------------ */
5302          case CHAR_EQUALS_SIGN:                 /* Positive lookahead */          case CHAR_EQUALS_SIGN:                 /* Positive lookahead */
5303          bravalue = OP_ASSERT;          bravalue = OP_ASSERT;
5304            cd->assert_depth += 1;
5305          ptr++;          ptr++;
5306          break;          break;
5307    
# Line 5242  for (;; ptr++) Line 5316  for (;; ptr++)
5316            continue;            continue;
5317            }            }
5318          bravalue = OP_ASSERT_NOT;          bravalue = OP_ASSERT_NOT;
5319            cd->assert_depth += 1;
5320          break;          break;
5321    
5322    
# Line 5251  for (;; ptr++) Line 5326  for (;; ptr++)
5326            {            {
5327            case CHAR_EQUALS_SIGN:               /* Positive lookbehind */            case CHAR_EQUALS_SIGN:               /* Positive lookbehind */
5328            bravalue = OP_ASSERTBACK;            bravalue = OP_ASSERTBACK;
5329              cd->assert_depth += 1;
5330            ptr += 2;            ptr += 2;
5331            break;            break;
5332    
5333            case CHAR_EXCLAMATION_MARK:          /* Negative lookbehind */            case CHAR_EXCLAMATION_MARK:          /* Negative lookbehind */
5334            bravalue = OP_ASSERTBACK_NOT;            bravalue = OP_ASSERTBACK_NOT;
5335              cd->assert_depth += 1;
5336            ptr += 2;            ptr += 2;
5337            break;            break;
5338    
# Line 5499  for (;; ptr++) Line 5576  for (;; ptr++)
5576    
5577            temp = cd->end_pattern;            temp = cd->end_pattern;
5578            cd->end_pattern = ptr;            cd->end_pattern = ptr;
5579            recno = find_parens(cd, name, namelen,            recno = find_parens(cd, name, namelen,
5580              (options & PCRE_EXTENDED) != 0, utf8);              (options & PCRE_EXTENDED) != 0, utf8);
5581            cd->end_pattern = temp;            cd->end_pattern = temp;
5582            if (recno < 0) recno = 0;    /* Forward ref; set dummy number */            if (recno < 0) recno = 0;    /* Forward ref; set dummy number */
# Line 5646  for (;; ptr++) Line 5723  for (;; ptr++)
5723    
5724                /* Fudge the value of "called" so that when it is inserted as an                /* Fudge the value of "called" so that when it is inserted as an
5725                offset below, what is actually inserted is the reference number                offset below, what is actually inserted is the reference number
5726                of the group. */                of the group. Then remember the forward reference. */
5727    
5728                called = cd->start_code + recno;                called = cd->start_code + recno;
5729                PUTINC(cd->hwm, 0, (int)(code + 2 + LINK_SIZE - cd->start_code));                PUTINC(cd->hwm, 0, (int)(code + 1 - cd->start_code));
5730                }                }
5731    
5732              /* If not a forward reference, and the subpattern is still open,              /* If not a forward reference, and the subpattern is still open,
# Line 5664  for (;; ptr++) Line 5741  for (;; ptr++)
5741                }                }
5742              }              }
5743    
5744            /* Insert the recursion/subroutine item, automatically wrapped inside            /* Insert the recursion/subroutine item. */
5745            "once" brackets. Set up a "previous group" length so that a  
           subsequent quantifier will work. */  
   
           *code = OP_ONCE;  
           PUT(code, 1, 2 + 2*LINK_SIZE);  
           code += 1 + LINK_SIZE;  
   
5746            *code = OP_RECURSE;            *code = OP_RECURSE;
5747            PUT(code, 1, (int)(called - cd->start_code));            PUT(code, 1, (int)(called - cd->start_code));
5748            code += 1 + LINK_SIZE;            code += 1 + LINK_SIZE;
   
           *code = OP_KET;  
           PUT(code, 1, 2 + 2*LINK_SIZE);  
           code += 1 + LINK_SIZE;  
   
           length_prevgroup = 3 + 3*LINK_SIZE;  
5749            }            }
5750    
5751          /* Can't determine a first byte now */          /* Can't determine a first byte now */
# Line 5823  for (;; ptr++) Line 5888  for (;; ptr++)
5888             &length_prevgroup           /* Pre-compile phase */             &length_prevgroup           /* Pre-compile phase */
5889           ))           ))
5890        goto FAILED;        goto FAILED;
5891    
5892        if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NOT)
5893          cd->assert_depth -= 1;
5894    
5895      /* At the end of compiling, code is still pointing to the start of the      /* At the end of compiling, code is still pointing to the start of the
5896      group, while tempcode has been updated to point past the end of the group      group, while tempcode has been updated to point past the end of the group
# Line 5894  for (;; ptr++) Line 5962  for (;; ptr++)
5962          goto FAILED;          goto FAILED;
5963          }          }
5964        *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;        *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
5965        *code++ = OP_BRA;        code++;   /* This already contains bravalue */
5966        PUTINC(code, 0, 1 + LINK_SIZE);        PUTINC(code, 0, 1 + LINK_SIZE);
5967        *code++ = OP_KET;        *code++ = OP_KET;
5968        PUTINC(code, 0, 1 + LINK_SIZE);        PUTINC(code, 0, 1 + LINK_SIZE);
# Line 6062  for (;; ptr++) Line 6130  for (;; ptr++)
6130          }          }
6131    
6132        /* \k<name> or \k'name' is a back reference by name (Perl syntax).        /* \k<name> or \k'name' is a back reference by name (Perl syntax).
6133        We also support \k{name} (.NET syntax) */        We also support \k{name} (.NET syntax).  */
6134    
6135        if (-c == ESC_k && (ptr[1] == CHAR_LESS_THAN_SIGN ||        if (-c == ESC_k)
           ptr[1] == CHAR_APOSTROPHE || ptr[1] == CHAR_LEFT_CURLY_BRACKET))  
6136          {          {
6137            if ((ptr[1] != CHAR_LESS_THAN_SIGN &&
6138              ptr[1] != CHAR_APOSTROPHE && ptr[1] != CHAR_LEFT_CURLY_BRACKET))
6139              {
6140              *errorcodeptr = ERR69;
6141              break;
6142              }
6143          is_recurse = FALSE;          is_recurse = FALSE;
6144          terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?          terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
6145            CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?            CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
6146            CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;            CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
6147          goto NAMED_REF_OR_RECURSE;          goto NAMED_REF_OR_RECURSE;
6148          }          }
6149    
6150        /* Back references are handled specially; must disable firstbyte if        /* Back references are handled specially; must disable firstbyte if
6151        not set to cope with cases like (?=(\w+))\1: which would otherwise set        not set to cope with cases like (?=(\w+))\1: which would otherwise set
# Line 6969  utf8 = (options & PCRE_UTF8) != 0; Line 7042  utf8 = (options & PCRE_UTF8) != 0;
7042    
7043  /* Can't support UTF8 unless PCRE has been compiled to include the code. The  /* Can't support UTF8 unless PCRE has been compiled to include the code. The
7044  return of an error code from _pcre_valid_utf8() is a new feature, introduced in  return of an error code from _pcre_valid_utf8() is a new feature, introduced in
7045  release 8.13. The only use we make of it here is to adjust the offset value to  release 8.13. It is passed back from pcre_[dfa_]exec(), but at the moment is
7046  the end of the string for a short string error, for compatibility with previous  not used here. */
 versions. */  
7047    
7048  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
7049  if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&  if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
7050       (*erroroffset = _pcre_valid_utf8((USPTR)pattern, -1, &errorcode)) >= 0)       (errorcode = _pcre_valid_utf8((USPTR)pattern, -1, erroroffset)) != 0)
7051    {    {
7052    errorcode = ERR44;    errorcode = ERR44;
7053    goto PCRE_EARLY_ERROR_RETURN2;    goto PCRE_EARLY_ERROR_RETURN2;
# Line 7146  field; this time it's used for rememberi Line 7218  field; this time it's used for rememberi
7218  */  */
7219    
7220  cd->final_bracount = cd->bracount;  /* Save for checking forward references */  cd->final_bracount = cd->bracount;  /* Save for checking forward references */
7221    cd->assert_depth = 0;
7222  cd->bracount = 0;  cd->bracount = 0;
7223  cd->names_found = 0;  cd->names_found = 0;
7224  cd->name_table = (uschar *)re + re->name_table_offset;  cd->name_table = (uschar *)re + re->name_table_offset;

Legend:
Removed from v.604  
changed lines
  Added in v.635

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12