/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 282 by ph10, Fri Dec 7 19:32:32 2007 UTC revision 305 by ph10, Sun Jan 20 20:07:32 2008 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2007 University of Cambridge             Copyright (c) 1997-2008 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 1531  for (code = first_significant_code(code Line 1531  for (code = first_significant_code(code
1531    const uschar *ccode;    const uschar *ccode;
1532    
1533    c = *code;    c = *code;
1534    
1535    /* Skip over forward assertions; the other assertions are skipped by    /* Skip over forward assertions; the other assertions are skipped by
1536    first_significant_code() with a TRUE final argument. */    first_significant_code() with a TRUE final argument. */
1537    
1538    if (c == OP_ASSERT)    if (c == OP_ASSERT)
1539      {      {
1540      do code += GET(code, 1); while (*code == OP_ALT);      do code += GET(code, 1); while (*code == OP_ALT);
1541      c = *code;      c = *code;
1542      continue;      continue;
1543      }      }
1544    
1545    /* Groups with zero repeats can of course be empty; skip them. */    /* Groups with zero repeats can of course be empty; skip them. */
1546    
# Line 1737  return TRUE; Line 1737  return TRUE;
1737  *************************************************/  *************************************************/
1738    
1739  /* This function is called when the sequence "[:" or "[." or "[=" is  /* This function is called when the sequence "[:" or "[." or "[=" is
1740  encountered in a character class. It checks whether this is followed by an  encountered in a character class. It checks whether this is followed by a
1741  optional ^ and then a sequence of letters, terminated by a matching ":]" or  sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
1742  ".]" or "=]".  reach an unescaped ']' without the special preceding character, return FALSE.
1743    
1744    Originally, this function only recognized a sequence of letters between the
1745    terminators, but it seems that Perl recognizes any sequence of characters,
1746    though of course unknown POSIX names are subsequently rejected. Perl gives an
1747    "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
1748    didn't consider this to be a POSIX class. Likewise for [:1234:].
1749    
1750    The problem in trying to be exactly like Perl is in the handling of escapes. We
1751    have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
1752    class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
1753    below handles the special case of \], but does not try to do any other escape
1754    processing. This makes it different from Perl for cases such as [:l\ower:]
1755    where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
1756    "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
1757    I think.
1758    
1759  Argument:  Arguments:
1760    ptr      pointer to the initial [    ptr      pointer to the initial [
1761    endptr   where to return the end pointer    endptr   where to return the end pointer
   cd       pointer to compile data  
1762    
1763  Returns:   TRUE or FALSE  Returns:   TRUE or FALSE
1764  */  */
1765    
1766  static BOOL  static BOOL
1767  check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)  check_posix_syntax(const uschar *ptr, const uschar **endptr)
1768  {  {
1769  int terminator;          /* Don't combine these lines; the Solaris cc */  int terminator;          /* Don't combine these lines; the Solaris cc */
1770  terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */  terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
1771  if (*(++ptr) == '^') ptr++;  for (++ptr; *ptr != 0; ptr++)
 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;  
 if (*ptr == terminator && ptr[1] == ']')  
1772    {    {
1773    *endptr = ptr;    if (*ptr == '\\' && ptr[1] == ']') ptr++; else
1774    return TRUE;      {
1775        if (*ptr == ']') return FALSE;
1776        if (*ptr == terminator && ptr[1] == ']')
1777          {
1778          *endptr = ptr;
1779          return TRUE;
1780          }
1781        }
1782    }    }
1783  return FALSE;  return FALSE;
1784  }  }
# Line 2357  uschar classbits[32]; Line 2376  uschar classbits[32];
2376  BOOL class_utf8;  BOOL class_utf8;
2377  BOOL utf8 = (options & PCRE_UTF8) != 0;  BOOL utf8 = (options & PCRE_UTF8) != 0;
2378  uschar *class_utf8data;  uschar *class_utf8data;
2379    uschar *class_utf8data_base;
2380  uschar utf8_char[6];  uschar utf8_char[6];
2381  #else  #else
2382  BOOL utf8 = FALSE;  BOOL utf8 = FALSE;
# Line 2396  req_caseopt = ((options & PCRE_CASELESS) Line 2416  req_caseopt = ((options & PCRE_CASELESS)
2416  for (;; ptr++)  for (;; ptr++)
2417    {    {
2418    BOOL negate_class;    BOOL negate_class;
2419    BOOL should_flip_negation;    BOOL should_flip_negation;
2420    BOOL possessive_quantifier;    BOOL possessive_quantifier;
2421    BOOL is_quantifier;    BOOL is_quantifier;
2422    BOOL is_recurse;    BOOL is_recurse;
# Line 2620  for (;; ptr++) Line 2640  for (;; ptr++)
2640      they are encountered at the top level, so we'll do that too. */      they are encountered at the top level, so we'll do that too. */
2641    
2642      if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&      if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2643          check_posix_syntax(ptr, &tempptr, cd))          check_posix_syntax(ptr, &tempptr))
2644        {        {
2645        *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;        *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2646        goto FAILED;        goto FAILED;
# Line 2645  for (;; ptr++) Line 2665  for (;; ptr++)
2665        else break;        else break;
2666        }        }
2667    
2668      /* If a class contains a negative special such as \S, we need to flip the      /* If a class contains a negative special such as \S, we need to flip the
2669      negation flag at the end, so that support for characters > 255 works      negation flag at the end, so that support for characters > 255 works
2670      correctly (they are all included in the class). */      correctly (they are all included in the class). */
2671    
2672      should_flip_negation = FALSE;      should_flip_negation = FALSE;
# Line 2668  for (;; ptr++) Line 2688  for (;; ptr++)
2688  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2689      class_utf8 = FALSE;                       /* No chars >= 256 */      class_utf8 = FALSE;                       /* No chars >= 256 */
2690      class_utf8data = code + LINK_SIZE + 2;    /* For UTF-8 items */      class_utf8data = code + LINK_SIZE + 2;    /* For UTF-8 items */
2691        class_utf8data_base = class_utf8data;     /* For resetting in pass 1 */
2692  #endif  #endif
2693    
2694      /* Process characters until ] is reached. By writing this as a "do" it      /* Process characters until ] is reached. By writing this as a "do" it
# Line 2683  for (;; ptr++) Line 2704  for (;; ptr++)
2704          {                           /* Braces are required because the */          {                           /* Braces are required because the */
2705          GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */          GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
2706          }          }
2707    
2708          /* In the pre-compile phase, accumulate the length of any UTF-8 extra
2709          data and reset the pointer. This is so that very large classes that
2710          contain a zillion UTF-8 characters no longer overwrite the work space
2711          (which is on the stack). */
2712    
2713          if (lengthptr != NULL)
2714            {
2715            *lengthptr += class_utf8data - class_utf8data_base;
2716            class_utf8data = class_utf8data_base;
2717            }
2718    
2719  #endif  #endif
2720    
2721        /* Inside \Q...\E everything is literal except \E */        /* Inside \Q...\E everything is literal except \E */
# Line 2706  for (;; ptr++) Line 2739  for (;; ptr++)
2739    
2740        if (c == '[' &&        if (c == '[' &&
2741            (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&            (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2742            check_posix_syntax(ptr, &tempptr, cd))            check_posix_syntax(ptr, &tempptr))
2743          {          {
2744          BOOL local_negate = FALSE;          BOOL local_negate = FALSE;
2745          int posix_class, taboffset, tabopt;          int posix_class, taboffset, tabopt;
# Line 2723  for (;; ptr++) Line 2756  for (;; ptr++)
2756          if (*ptr == '^')          if (*ptr == '^')
2757            {            {
2758            local_negate = TRUE;            local_negate = TRUE;
2759            should_flip_negation = TRUE;  /* Note negative special */            should_flip_negation = TRUE;  /* Note negative special */
2760            ptr++;            ptr++;
2761            }            }
2762    
# Line 2826  for (;; ptr++) Line 2859  for (;; ptr++)
2859              continue;              continue;
2860    
2861              case ESC_D:              case ESC_D:
2862              should_flip_negation = TRUE;              should_flip_negation = TRUE;
2863              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2864              continue;              continue;
2865    
# Line 2835  for (;; ptr++) Line 2868  for (;; ptr++)
2868              continue;              continue;
2869    
2870              case ESC_W:              case ESC_W:
2871              should_flip_negation = TRUE;              should_flip_negation = TRUE;
2872              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2873              continue;              continue;
2874    
# Line 2845  for (;; ptr++) Line 2878  for (;; ptr++)
2878              continue;              continue;
2879    
2880              case ESC_S:              case ESC_S:
2881              should_flip_negation = TRUE;              should_flip_negation = TRUE;
2882              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2883              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
2884              continue;              continue;
# Line 3348  we set the flag only if there is a liter Line 3381  we set the flag only if there is a liter
3381      zeroreqbyte = reqbyte;      zeroreqbyte = reqbyte;
3382    
3383      /* If there are characters with values > 255, we have to compile an      /* If there are characters with values > 255, we have to compile an
3384      extended class, with its own opcode, unless there was a negated special      extended class, with its own opcode, unless there was a negated special
3385      such as \S in the class, because in that case all characters > 255 are in      such as \S in the class, because in that case all characters > 255 are in
3386      the class, so any that were explicitly given as well can be ignored. If      the class, so any that were explicitly given as well can be ignored. If
3387      (when there are explicit characters > 255 that must be listed) there are no      (when there are explicit characters > 255 that must be listed) there are no
3388      characters < 256, we can omit the bitmap in the actual compiled code. */      characters < 256, we can omit the bitmap in the actual compiled code. */
3389    
# Line 3381  we set the flag only if there is a liter Line 3414  we set the flag only if there is a liter
3414        }        }
3415  #endif  #endif
3416    
3417      /* If there are no characters > 255, set the opcode to OP_CLASS or      /* If there are no characters > 255, set the opcode to OP_CLASS or
3418      OP_NCLASS, depending on whether the whole class was negated and whether      OP_NCLASS, depending on whether the whole class was negated and whether
3419      there were negative specials such as \S in the class. Then copy the 32-byte      there were negative specials such as \S in the class. Then copy the 32-byte
3420      map into the code vector, negating it if necessary. */      map into the code vector, negating it if necessary. */
3421    
3422      *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;      *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
3423      if (negate_class)      if (negate_class)
3424        {        {
# Line 4029  we set the flag only if there is a liter Line 4062  we set the flag only if there is a liter
4062        int len;        int len;
4063        if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||        if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
4064            *tempcode == OP_NOTEXACT)            *tempcode == OP_NOTEXACT)
4065          tempcode += _pcre_OP_lengths[*tempcode];          tempcode += _pcre_OP_lengths[*tempcode] +
4066              ((*tempcode == OP_TYPEEXACT &&
4067                 (tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP))? 2:0);
4068        len = code - tempcode;        len = code - tempcode;
4069        if (len > 0) switch (*tempcode)        if (len > 0) switch (*tempcode)
4070          {          {
# Line 4256  we set the flag only if there is a liter Line 4291  we set the flag only if there is a liter
4291              *errorcodeptr = ERR58;              *errorcodeptr = ERR58;
4292              goto FAILED;              goto FAILED;
4293              }              }
4294            recno = (refsign == '-')?            recno = (refsign == '-')?
4295              cd->bracount - recno + 1 : recno +cd->bracount;              cd->bracount - recno + 1 : recno +cd->bracount;
4296            if (recno <= 0 || recno > cd->final_bracount)            if (recno <= 0 || recno > cd->final_bracount)
4297              {              {
# Line 4335  we set the flag only if there is a liter Line 4370  we set the flag only if there is a liter
4370            }            }
4371    
4372          /* Check for the "name" actually being a subpattern number. We are          /* Check for the "name" actually being a subpattern number. We are
4373          in the second pass here, so final_bracount is set. */          in the second pass here, so final_bracount is set. */
4374    
4375          else if (recno > 0 && recno <= cd->final_bracount)          else if (recno > 0 && recno <= cd->final_bracount)
4376            {            {
# Line 4549  we set the flag only if there is a liter Line 4584  we set the flag only if there is a liter
4584              {              {
4585              *errorcodeptr = ERR62;              *errorcodeptr = ERR62;
4586              goto FAILED;              goto FAILED;
4587              }              }
4588            if (*ptr != terminator)            if (*ptr != terminator)
4589              {              {
4590              *errorcodeptr = ERR42;              *errorcodeptr = ERR42;
# Line 4563  we set the flag only if there is a liter Line 4598  we set the flag only if there is a liter
4598            recno = 0;            recno = 0;
4599            }            }
4600    
4601          /* In the real compile, seek the name in the table. We check the name          /* In the real compile, seek the name in the table. We check the name
4602          first, and then check that we have reached the end of the name in the          first, and then check that we have reached the end of the name in the
4603          table. That way, if the name is longer than any in the table,          table. That way, if the name is longer than any in the table,
4604          the comparison will fail without reading beyond the table entry. */          the comparison will fail without reading beyond the table entry. */
4605    
# Line 4574  we set the flag only if there is a liter Line 4609  we set the flag only if there is a liter
4609            for (i = 0; i < cd->names_found; i++)            for (i = 0; i < cd->names_found; i++)
4610              {              {
4611              if (strncmp((char *)name, (char *)slot+2, namelen) == 0 &&              if (strncmp((char *)name, (char *)slot+2, namelen) == 0 &&
4612                  slot[2+namelen] == 0)                  slot[2+namelen] == 0)
4613                break;                break;
4614              slot += cd->name_entry_size;              slot += cd->name_entry_size;
4615              }              }
# Line 4612  we set the flag only if there is a liter Line 4647  we set the flag only if there is a liter
4647            {            {
4648            const uschar *called;            const uschar *called;
4649    
4650            if ((refsign = *ptr) == '+')            if ((refsign = *ptr) == '+')
4651              {              {
4652              ptr++;              ptr++;
4653              if ((digitab[*ptr] & ctype_digit) == 0)              if ((digitab[*ptr] & ctype_digit) == 0)
4654                {                {
4655                *errorcodeptr = ERR63;                *errorcodeptr = ERR63;
4656                goto FAILED;                goto FAILED;
4657                }                }
4658              }              }
4659            else if (refsign == '-')            else if (refsign == '-')
4660              {              {
4661              if ((digitab[ptr[1]] & ctype_digit) == 0)              if ((digitab[ptr[1]] & ctype_digit) == 0)
# Line 5786  to fill in forward references to subpatt Line 5821  to fill in forward references to subpatt
5821    
5822  uschar cworkspace[COMPILE_WORK_SIZE];  uschar cworkspace[COMPILE_WORK_SIZE];
5823    
   
5824  /* Set this early so that early errors get offset 0. */  /* Set this early so that early errors get offset 0. */
5825    
5826  ptr = (const uschar *)pattern;  ptr = (const uschar *)pattern;

Legend:
Removed from v.282  
changed lines
  Added in v.305

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12