/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 342 by ph10, Sun Apr 20 17:10:13 2008 UTC revision 345 by ph10, Mon Apr 28 15:10:02 2008 UTC
# Line 303  static const char error_texts[] = Line 303  static const char error_texts[] =
303    "number is too big\0"    "number is too big\0"
304    "subpattern name expected\0"    "subpattern name expected\0"
305    "digit expected after (?+\0"    "digit expected after (?+\0"
306    "] is an invalid data character in JavaScript compatibility mode";    "] is an invalid data character in JavaScript compatibility mode";
307    
308    
309  /* Table to identify digits and hex digits. This is used when compiling  /* Table to identify digits and hex digits. This is used when compiling
# Line 533  else Line 533  else
533      break;      break;
534    
535      /* \g must be followed by one of a number of specific things:      /* \g must be followed by one of a number of specific things:
536    
537      (1) A number, either plain or braced. If positive, it is an absolute      (1) A number, either plain or braced. If positive, it is an absolute
538      backreference. If negative, it is a relative backreference. This is a Perl      backreference. If negative, it is a relative backreference. This is a Perl
539      5.10 feature.      5.10 feature.
540    
541      (2) Perl 5.10 also supports \g{name} as a reference to a named group. This      (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
542      is part of Perl's movement towards a unified syntax for back references. As      is part of Perl's movement towards a unified syntax for back references. As
543      this is synonymous with \k{name}, we fudge it up by pretending it really      this is synonymous with \k{name}, we fudge it up by pretending it really
544      was \k.      was \k.
545    
546      (3) For Oniguruma compatibility we also support \g followed by a name or a      (3) For Oniguruma compatibility we also support \g followed by a name or a
547      number either in angle brackets or in single quotes. However, these are      number either in angle brackets or in single quotes. However, these are
548      (possibly recursive) subroutine calls, _not_ backreferences. Just return      (possibly recursive) subroutine calls, _not_ backreferences. Just return
549      the -ESC_g code (cf \k). */      the -ESC_g code (cf \k). */
550    
551      case 'g':      case 'g':
552      if (ptr[1] == '<' || ptr[1] == '\'')      if (ptr[1] == '<' || ptr[1] == '\'')
553        {        {
554        c = -ESC_g;        c = -ESC_g;
555        break;        break;
556        }        }
557    
558      /* Handle the Perl-compatible cases */      /* Handle the Perl-compatible cases */
559    
560      if (ptr[1] == '{')      if (ptr[1] == '{')
561        {        {
562        const uschar *p;        const uschar *p;
# Line 588  else Line 588  else
588        *errorcodeptr = ERR61;        *errorcodeptr = ERR61;
589        break;        break;
590        }        }
591    
592      if (braced && *(++ptr) != '}')      if (braced && *(++ptr) != '}')
593        {        {
594        *errorcodeptr = ERR57;        *errorcodeptr = ERR57;
595        break;        break;
596        }        }
597    
598      if (c == 0)      if (c == 0)
599        {        {
600        *errorcodeptr = ERR58;        *errorcodeptr = ERR58;
601        break;        break;
602        }        }
603    
604      if (negated)      if (negated)
605        {        {
# Line 976  be terminated by '>' because that is che Line 976  be terminated by '>' because that is che
976    
977  Arguments:  Arguments:
978    ptr          current position in the pattern    ptr          current position in the pattern
979    cd           compile background data    cd           compile background data
980    name         name to seek, or NULL if seeking a numbered subpattern    name         name to seek, or NULL if seeking a numbered subpattern
981    lorn         name length, or subpattern number if name is NULL    lorn         name length, or subpattern number if name is NULL
982    xmode        TRUE if we are in /x mode    xmode        TRUE if we are in /x mode
# Line 1033  for (; *ptr != 0; ptr++) Line 1033  for (; *ptr != 0; ptr++)
1033    
1034      /* If the next character is ']', it is a data character that must be      /* If the next character is ']', it is a data character that must be
1035      skipped, except in JavaScript compatibility mode. */      skipped, except in JavaScript compatibility mode. */
1036    
1037      if (ptr[1] == ']' && (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)      if (ptr[1] == ']' && (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
1038        ptr++;        ptr++;
1039    
1040      while (*(++ptr) != ']')      while (*(++ptr) != ']')
1041        {        {
1042        if (*ptr == 0) return -1;        if (*ptr == 0) return -1;
# Line 1680  for (code = first_significant_code(code Line 1680  for (code = first_significant_code(code
1680      case OP_NOT_WORDCHAR:      case OP_NOT_WORDCHAR:
1681      case OP_WORDCHAR:      case OP_WORDCHAR:
1682      case OP_ANY:      case OP_ANY:
1683      case OP_ALLANY:      case OP_ALLANY:
1684      case OP_ANYBYTE:      case OP_ANYBYTE:
1685      case OP_CHAR:      case OP_CHAR:
1686      case OP_CHARNC:      case OP_CHARNC:
# Line 1911  while ((ptr = (uschar *)find_recurse(ptr Line 1911  while ((ptr = (uschar *)find_recurse(ptr
1911    
1912    /* See if this recursion is on the forward reference list. If so, adjust the    /* See if this recursion is on the forward reference list. If so, adjust the
1913    reference. */    reference. */
1914    
1915    for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)    for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1916      {      {
1917      offset = GET(hc, 0);      offset = GET(hc, 0);
# Line 2487  for (;; ptr++) Line 2487  for (;; ptr++)
2487    /* Get next byte in the pattern */    /* Get next byte in the pattern */
2488    
2489    c = *ptr;    c = *ptr;
2490    
2491    /* If we are in the pre-compile phase, accumulate the length used for the    /* If we are in the pre-compile phase, accumulate the length used for the
2492    previous cycle of this loop. */    previous cycle of this loop. */
2493    
# Line 2682  for (;; ptr++) Line 2682  for (;; ptr++)
2682      opcode is compiled. It may optionally have a bit map for characters < 256,      opcode is compiled. It may optionally have a bit map for characters < 256,
2683      but those above are explicitly listed afterwards. A flag byte tells      but those above are explicitly listed afterwards. A flag byte tells
2684      whether the bitmap is present, and whether this is a negated class or not.      whether the bitmap is present, and whether this is a negated class or not.
2685    
2686      In JavaScript compatibility mode, an isolated ']' causes an error. In      In JavaScript compatibility mode, an isolated ']' causes an error. In
2687      default (Perl) mode, it is treated as a data character. */      default (Perl) mode, it is treated as a data character. */
2688    
2689      case ']':      case ']':
2690      if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)      if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
2691        {        {
2692        *errorcodeptr = ERR64;        *errorcodeptr = ERR64;
2693        goto FAILED;        goto FAILED;
2694        }        }
2695      goto NORMAL_CHAR;      goto NORMAL_CHAR;
2696    
2697      case '[':      case '[':
2698      previous = code;      previous = code;
# Line 2725  for (;; ptr++) Line 2725  for (;; ptr++)
2725          negate_class = TRUE;          negate_class = TRUE;
2726        else break;        else break;
2727        }        }
2728    
2729      /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,      /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
2730      an initial ']' is taken as a data character -- the code below handles      an initial ']' is taken as a data character -- the code below handles
2731      that. In JS mode, [] must always fail, so generate OP_FAIL, whereas      that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
2732      [^] must match any character, so generate OP_ALLANY. */      [^] must match any character, so generate OP_ALLANY. */
2733    
2734      if (c ==']' && (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)      if (c ==']' && (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
2735        {        {
2736        *code++ = negate_class? OP_ALLANY : OP_FAIL;        *code++ = negate_class? OP_ALLANY : OP_FAIL;
2737        if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;        if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2738        zerofirstbyte = firstbyte;        zerofirstbyte = firstbyte;
2739        break;        break;
2740        }        }
2741    
2742      /* If a class contains a negative special such as \S, we need to flip the      /* If a class contains a negative special such as \S, we need to flip the
2743      negation flag at the end, so that support for characters > 255 works      negation flag at the end, so that support for characters > 255 works
# Line 3902  we set the flag only if there is a liter Line 3902  we set the flag only if there is a liter
3902          **   code = previous;          **   code = previous;
3903          **   goto END_REPEAT;          **   goto END_REPEAT;
3904          **   }          **   }
3905    
3906          However, that fails when a group is referenced as a subroutine from          However, that fails when a group is referenced as a subroutine from
3907          elsewhere in the pattern, so now we stick in OP_SKIPZERO in front of it          elsewhere in the pattern, so now we stick in OP_SKIPZERO in front of it
3908          so that it is skipped on execution. As we don't have a list of which          so that it is skipped on execution. As we don't have a list of which
3909          groups are referenced, we cannot do this selectively.          groups are referenced, we cannot do this selectively.
3910    
3911          If the maximum is 1 or unlimited, we just have to stick in the BRAZERO          If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
3912          and do no more at this point. However, we do need to adjust any          and do no more at this point. However, we do need to adjust any
# Line 3925  we set the flag only if there is a liter Line 3925  we set the flag only if there is a liter
3925              {              {
3926              *previous++ = OP_SKIPZERO;              *previous++ = OP_SKIPZERO;
3927              goto END_REPEAT;              goto END_REPEAT;
3928              }              }
3929            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
3930            }            }
3931    
# Line 4119  we set the flag only if there is a liter Line 4119  we set the flag only if there is a liter
4119            }            }
4120          }          }
4121        }        }
4122    
4123      /* If previous is OP_FAIL, it was generated by an empty class [] in      /* If previous is OP_FAIL, it was generated by an empty class [] in
4124      JavaScript mode. The other ways in which OP_FAIL can be generated, that is      JavaScript mode. The other ways in which OP_FAIL can be generated, that is
4125      by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat"      by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat"
4126      error above. We can just ignore the repeat in the JS case. */      error above. We can just ignore the repeat in the JS case. */
4127    
4128      else if (*previous == OP_FAIL) goto END_REPEAT;      else if (*previous == OP_FAIL) goto END_REPEAT;
4129    
4130      /* Else there's some kind of shambles */      /* Else there's some kind of shambles */
4131    
# Line 4207  we set the flag only if there is a liter Line 4207  we set the flag only if there is a liter
4207      bravalue = OP_CBRA;      bravalue = OP_CBRA;
4208      save_hwm = cd->hwm;      save_hwm = cd->hwm;
4209      reset_bracount = FALSE;      reset_bracount = FALSE;
4210    
4211      /* First deal with various "verbs" that can be introduced by '*'. */      /* First deal with various "verbs" that can be introduced by '*'. */
4212    
4213      if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0)      if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
# Line 4738  we set the flag only if there is a liter Line 4738  we set the flag only if there is a liter
4738            {            {
4739            const uschar *called;            const uschar *called;
4740            terminator = ')';            terminator = ')';
4741    
4742            /* Come here from the \g<...> and \g'...' code (Oniguruma            /* Come here from the \g<...> and \g'...' code (Oniguruma
4743            compatibility). However, the syntax has been checked to ensure that            compatibility). However, the syntax has been checked to ensure that
4744            the ... are a (signed) number, so that neither ERR63 nor ERR29 will            the ... are a (signed) number, so that neither ERR63 nor ERR29 will
4745            be called on this path, nor will the jump to OTHER_CHAR_AFTER_QUERY            be called on this path, nor will the jump to OTHER_CHAR_AFTER_QUERY
4746            ever be taken. */            ever be taken. */
4747    
4748            HANDLE_NUMERICAL_RECURSION:            HANDLE_NUMERICAL_RECURSION:
4749    
4750            if ((refsign = *ptr) == '+')            if ((refsign = *ptr) == '+')
4751              {              {
# Line 5163  we set the flag only if there is a liter Line 5163  we set the flag only if there is a liter
5163      back references and those types that consume a character may be repeated.      back references and those types that consume a character may be repeated.
5164      We can test for values between ESC_b and ESC_Z for the latter; this may      We can test for values between ESC_b and ESC_Z for the latter; this may
5165      have to change if any new ones are ever created. */      have to change if any new ones are ever created. */
5166    
5167      case '\\':      case '\\':
5168      tempptr = ptr;      tempptr = ptr;
5169      c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);      c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
# Line 5190  we set the flag only if there is a liter Line 5190  we set the flag only if there is a liter
5190    
5191        zerofirstbyte = firstbyte;        zerofirstbyte = firstbyte;
5192        zeroreqbyte = reqbyte;        zeroreqbyte = reqbyte;
5193    
5194        /* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n'        /* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n'
5195        is a subroutine call by number (Oniguruma syntax). In fact, the value        is a subroutine call by number (Oniguruma syntax). In fact, the value
5196        -ESC_g is returned only for these cases. So we don't need to check for <        -ESC_g is returned only for these cases. So we don't need to check for <
5197        or ' if the value is -ESC_g. For the Perl syntax \g{n} the value is        or ' if the value is -ESC_g. For the Perl syntax \g{n} the value is
5198        -ESC_REF+n, and for the Perl syntax \g{name} the result is -ESC_k (as        -ESC_REF+n, and for the Perl syntax \g{name} the result is -ESC_k (as
5199        that is a synonym for a named back reference). */        that is a synonym for a named back reference). */
5200    
5201        if (-c == ESC_g)        if (-c == ESC_g)
5202          {          {
5203          const uschar *p;          const uschar *p;
5204          save_hwm = cd->hwm;   /* Normally this is set when '(' is read */          save_hwm = cd->hwm;   /* Normally this is set when '(' is read */
5205          terminator = (*(++ptr) == '<')? '>' : '\'';          terminator = (*(++ptr) == '<')? '>' : '\'';
5206    
5207          /* These two statements stop the compiler from warning about possibly          /* These two statements stop the compiler from warning about possibly
5208          unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In          unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In
5209          fact, because we actually check for a number below, the paths that          fact, because we actually check for a number below, the paths that
5210          would actually be in error are never taken. */          would actually be in error are never taken. */
5211    
5212          skipbytes = 0;          skipbytes = 0;
5213          reset_bracount = FALSE;          reset_bracount = FALSE;
5214    
5215          /* Test for a name */          /* Test for a name */
5216    
5217          if (ptr[1] != '+' && ptr[1] != '-')          if (ptr[1] != '+' && ptr[1] != '-')
5218            {            {
5219            BOOL isnumber = TRUE;            BOOL isnumber = TRUE;
5220            for (p = ptr + 1; *p != 0 && *p != terminator; p++)            for (p = ptr + 1; *p != 0 && *p != terminator; p++)
5221              {              {
5222              if ((cd->ctypes[*p] & ctype_digit) == 0) isnumber = FALSE;              if ((cd->ctypes[*p] & ctype_digit) == 0) isnumber = FALSE;
5223              if ((cd->ctypes[*p] & ctype_word) == 0) break;              if ((cd->ctypes[*p] & ctype_word) == 0) break;
5224              }              }
5225            if (*p != terminator)            if (*p != terminator)
5226              {              {
5227              *errorcodeptr = ERR57;              *errorcodeptr = ERR57;
5228              break;              break;
5229              }              }
5230            if (isnumber)            if (isnumber)
5231              {              {
5232              ptr++;              ptr++;
5233              goto HANDLE_NUMERICAL_RECURSION;              goto HANDLE_NUMERICAL_RECURSION;
5234              }              }
5235            is_recurse = TRUE;            is_recurse = TRUE;
5236            goto NAMED_REF_OR_RECURSE;            goto NAMED_REF_OR_RECURSE;
5237            }            }
5238    
5239          /* Test a signed number in angle brackets or quotes. */          /* Test a signed number in angle brackets or quotes. */
5240    
5241          p = ptr + 2;          p = ptr + 2;
5242          while ((digitab[*p] & ctype_digit) != 0) p++;          while ((digitab[*p] & ctype_digit) != 0) p++;
5243          if (*p != terminator)          if (*p != terminator)
# Line 5245  we set the flag only if there is a liter Line 5245  we set the flag only if there is a liter
5245            *errorcodeptr = ERR57;            *errorcodeptr = ERR57;
5246            break;            break;
5247            }            }
5248          ptr++;          ptr++;
5249          goto HANDLE_NUMERICAL_RECURSION;          goto HANDLE_NUMERICAL_RECURSION;
5250          }          }
5251    
5252        /* \k<name> or \k'name' is a back reference by name (Perl syntax).        /* \k<name> or \k'name' is a back reference by name (Perl syntax).
5253        We also support \k{name} (.NET syntax) */        We also support \k{name} (.NET syntax) */
# Line 5761  do { Line 5761  do {
5761     else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||     else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
5762               op == OP_TYPEPOSSTAR))               op == OP_TYPEPOSSTAR))
5763       {       {
5764       if (scode[1] != OP_ALLANY || (bracket_map & backref_map) != 0)       if (scode[1] != OP_ALLANY || (bracket_map & backref_map) != 0)
5765         return FALSE;         return FALSE;
5766       }       }
5767    
# Line 6267  while (errorcode == 0 && cd->hwm > cwork Line 6267  while (errorcode == 0 && cd->hwm > cwork
6267    if (groupptr == NULL) errorcode = ERR53;    if (groupptr == NULL) errorcode = ERR53;
6268      else PUT(((uschar *)codestart), offset, groupptr - codestart);      else PUT(((uschar *)codestart), offset, groupptr - codestart);
6269    }    }
6270    
6271  /* Give an error if there's a back reference to a non-existent capturing  /* Give an error if there's a back reference to a non-existent capturing
6272  subpattern. */  subpattern. */
6273    

Legend:
Removed from v.342  
changed lines
  Added in v.345

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12