/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory | Revision Log | View Patch

revision 391 by ph10, Tue Mar 17 21:16:01 2009 UTC | revision 392 by ph10, Tue Mar 17 21:30:30 2009 UTC
# Line 100  is invalid. */ Line 100  is invalid. */
100  #ifndef EBCDIC  #ifndef EBCDIC
101    
102  /* This is the "normal" table for ASCII systems or for EBCDIC systems running  /* This is the "normal" table for ASCII systems or for EBCDIC systems running
103  in UTF-8 mode. */  in UTF-8 mode. */
104    
105  static const short int escapes[] = {  static const short int escapes[] = {
106       0,                       0,       0,                       0,
107         0,                       0,
108         0,                       0,
109       0,                       0,       0,                       0,
      0,                       0,  
110       0,                       0,       0,                       0,
      0,                       0,  
111       CHAR_COLON,              CHAR_SEMICOLON,       CHAR_COLON,              CHAR_SEMICOLON,
112       CHAR_LESS_THAN_SIGN,     CHAR_EQUALS_SIGN,       CHAR_LESS_THAN_SIGN,     CHAR_EQUALS_SIGN,
113       CHAR_GREATER_THAN_SIGN,  CHAR_QUESTION_MARK,       CHAR_GREATER_THAN_SIGN,  CHAR_QUESTION_MARK,
114       CHAR_COMMERCIAL_AT,      -ESC_A,       CHAR_COMMERCIAL_AT,      -ESC_A,
115       -ESC_B,                  -ESC_C,       -ESC_B,                  -ESC_C,
116       -ESC_D,                  -ESC_E,       -ESC_D,                  -ESC_E,
117       0,                       -ESC_G,       0,                       -ESC_G,
118       -ESC_H,                  0,       -ESC_H,                  0,
119       0,                       -ESC_K,       0,                       -ESC_K,
120       0,                       0,       0,                       0,
121       0,                       0,       0,                       0,
122       -ESC_P,                  -ESC_Q,       -ESC_P,                  -ESC_Q,
123       -ESC_R,                  -ESC_S,       -ESC_R,                  -ESC_S,
124       0,                       0,       0,                       0,
125       -ESC_V,                  -ESC_W,       -ESC_V,                  -ESC_W,
126       -ESC_X,                  0,       -ESC_X,                  0,
127       -ESC_Z,                  CHAR_LEFT_SQUARE_BRACKET,       -ESC_Z,                  CHAR_LEFT_SQUARE_BRACKET,
128       CHAR_BACKSLASH,          CHAR_RIGHT_SQUARE_BRACKET,       CHAR_BACKSLASH,          CHAR_RIGHT_SQUARE_BRACKET,
129       CHAR_CIRCUMFLEX_ACCENT,  CHAR_UNDERSCORE,       CHAR_CIRCUMFLEX_ACCENT,  CHAR_UNDERSCORE,
130       CHAR_GRAVE_ACCENT,       7,       CHAR_GRAVE_ACCENT,       7,
131       -ESC_b,                  0,       -ESC_b,                  0,
132       -ESC_d,                  ESC_e,       -ESC_d,                  ESC_e,
133       ESC_f,                   0,       ESC_f,                   0,
134       -ESC_h,                  0,       -ESC_h,                  0,
135       0,                       -ESC_k,       0,                       -ESC_k,
136       0,                       0,       0,                       0,
137       ESC_n,                   0,       ESC_n,                   0,
138       -ESC_p,                  0,       -ESC_p,                  0,
139       ESC_r,                   -ESC_s,       ESC_r,                   -ESC_s,
140       ESC_tee,                 0,       ESC_tee,                 0,
141       -ESC_v,                  -ESC_w,       -ESC_v,                  -ESC_w,
142       0,                       0,       0,                       0,
143       -ESC_z       -ESC_z
144  };  };
145    
146  #else  #else
147    
148  /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */  /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
149    
# Line 177  static const short int escapes[] = { Line 177  static const short int escapes[] = {
177    
178  /* Table of special "verbs" like (*PRUNE). This is a short table, so it is  /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
179  searched linearly. Put all the names into a single string, in order to reduce  searched linearly. Put all the names into a single string, in order to reduce
180  the number of relocations when a shared library is dynamically linked. The  the number of relocations when a shared library is dynamically linked. The
181  string is built from string macros so that it works in UTF-8 mode on EBCDIC  string is built from string macros so that it works in UTF-8 mode on EBCDIC
182  platforms. */  platforms. */
183    
184  typedef struct verbitem {  typedef struct verbitem {
# Line 215  length entry. The first three must be al Line 215  length entry. The first three must be al
215  for handling case independence. */  for handling case independence. */
216    
217  static const char posix_names[] =  static const char posix_names[] =
218    STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0    STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
219    STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0    STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
220    STRING_graph0 STRING_print0 STRING_punct0 STRING_space0    STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
221    STRING_word0  STRING_xdigit;    STRING_word0  STRING_xdigit;
222    
# Line 360  For convenience, we use the same bit def Line 360  For convenience, we use the same bit def
360    
361  Then we can use ctype_digit and ctype_xdigit in the code. */  Then we can use ctype_digit and ctype_xdigit in the code. */
362    
363  #ifndef EBCDIC  #ifndef EBCDIC
364    
365  /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in  /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
366  UTF-8 mode. */  UTF-8 mode. */
367    
368  static const unsigned char digitab[] =  static const unsigned char digitab[] =
# Line 400  static const unsigned char digitab[] = Line 400  static const unsigned char digitab[] =
400    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
401    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
402    
403  #else  #else
404    
405  /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */  /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
406    
# Line 1057  for (; *ptr != 0; ptr++) Line 1057  for (; *ptr != 0; ptr++)
1057    /* Skip over character classes; this logic must be similar to the way they    /* Skip over character classes; this logic must be similar to the way they
1058    are handled for real. If the first character is '^', skip it. Also, if the    are handled for real. If the first character is '^', skip it. Also, if the
1059    first few characters (either before or after ^) are \Q\E or \E we skip them    first few characters (either before or after ^) are \Q\E or \E we skip them
1060    too. This makes for compatibility with Perl. Note the use of STR macros to    too. This makes for compatibility with Perl. Note the use of STR macros to
1061    encode "Q\\E" so that it works in UTF-8 on EBCDIC platforms. */    encode "Q\\E" so that it works in UTF-8 on EBCDIC platforms. */
1062    
1063    if (*ptr == CHAR_LEFT_SQUARE_BRACKET)    if (*ptr == CHAR_LEFT_SQUARE_BRACKET)
# Line 1068  for (; *ptr != 0; ptr++) Line 1068  for (; *ptr != 0; ptr++)
1068        int c = *(++ptr);        int c = *(++ptr);
1069        if (c == CHAR_BACKSLASH)        if (c == CHAR_BACKSLASH)
1070          {          {
1071          if (ptr[1] == CHAR_E)          if (ptr[1] == CHAR_E)
1072            ptr++;            ptr++;
1073          else if (strncmp((const char *)ptr+1,          else if (strncmp((const char *)ptr+1,
1074                   STR_Q STR_BACKSLASH STR_E, 3) == 0)                   STR_Q STR_BACKSLASH STR_E, 3) == 0)
1075            ptr += 3;            ptr += 3;
1076          else          else
1077            break;            break;
1078          }          }
1079        else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)        else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
# Line 1084  for (; *ptr != 0; ptr++) Line 1084  for (; *ptr != 0; ptr++)
1084      /* If the next character is ']', it is a data character that must be      /* If the next character is ']', it is a data character that must be
1085      skipped, except in JavaScript compatibility mode. */      skipped, except in JavaScript compatibility mode. */
1086    
1087      if (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET &&      if (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET &&
1088          (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)          (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
1089        ptr++;        ptr++;
1090    
# Line 1130  for (; *ptr != 0; ptr++) Line 1130  for (; *ptr != 0; ptr++)
1130    
1131    /* We have to disambiguate (?<! and (?<= from (?<name> */    /* We have to disambiguate (?<! and (?<= from (?<name> */
1132    
1133    if ((*ptr != CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_EXCLAMATION_MARK ||    if ((*ptr != CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_EXCLAMATION_MARK ||
1134        ptr[1] == CHAR_EQUALS_SIGN) && *ptr != CHAR_APOSTROPHE)        ptr[1] == CHAR_EQUALS_SIGN) && *ptr != CHAR_APOSTROPHE)
1135      continue;      continue;
1136    
# Line 2173  if ((options & PCRE_EXTENDED) != 0) Line 2173  if ((options & PCRE_EXTENDED) != 0)
2173    
2174  /* If the next thing is itself optional, we have to give up. */  /* If the next thing is itself optional, we have to give up. */
2175    
2176  if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||  if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
2177    strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)    strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
2178      return FALSE;      return FALSE;
2179    
# Line 2639  for (;; ptr++) Line 2639  for (;; ptr++)
2639    /* Fill in length of a previous callout, except when the next thing is    /* Fill in length of a previous callout, except when the next thing is
2640    a quantifier. */    a quantifier. */
2641    
2642    is_quantifier =    is_quantifier =
2643      c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||      c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
2644      (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));      (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
2645    
# Line 2759  for (;; ptr++) Line 2759  for (;; ptr++)
2759      /* PCRE supports POSIX class stuff inside a class. Perl gives an error if      /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2760      they are encountered at the top level, so we'll do that too. */      they are encountered at the top level, so we'll do that too. */
2761    
2762      if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||      if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
2763           ptr[1] == CHAR_EQUALS_SIGN) &&           ptr[1] == CHAR_EQUALS_SIGN) &&
2764          check_posix_syntax(ptr, &tempptr))          check_posix_syntax(ptr, &tempptr))
2765        {        {
# Line 2777  for (;; ptr++) Line 2777  for (;; ptr++)
2777        c = *(++ptr);        c = *(++ptr);
2778        if (c == CHAR_BACKSLASH)        if (c == CHAR_BACKSLASH)
2779          {          {
2780          if (ptr[1] == CHAR_E)          if (ptr[1] == CHAR_E)
2781            ptr++;            ptr++;
2782          else if (strncmp((const char *)ptr+1,          else if (strncmp((const char *)ptr+1,
2783                            STR_Q STR_BACKSLASH STR_E, 3) == 0)                            STR_Q STR_BACKSLASH STR_E, 3) == 0)
2784            ptr += 3;            ptr += 3;
2785          else          else
2786            break;            break;
2787          }          }
2788        else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)        else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
# Line 2795  for (;; ptr++) Line 2795  for (;; ptr++)
2795      that. In JS mode, [] must always fail, so generate OP_FAIL, whereas      that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
2796      [^] must match any character, so generate OP_ALLANY. */      [^] must match any character, so generate OP_ALLANY. */
2797    
2798      if (c == CHAR_RIGHT_SQUARE_BRACKET &&      if (c == CHAR_RIGHT_SQUARE_BRACKET &&
2799          (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)          (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
2800        {        {
2801        *code++ = negate_class? OP_ALLANY : OP_FAIL;        *code++ = negate_class? OP_ALLANY : OP_FAIL;
# Line 2877  for (;; ptr++) Line 2877  for (;; ptr++)
2877        5.6 and 5.8 do. */        5.6 and 5.8 do. */
2878    
2879        if (c == CHAR_LEFT_SQUARE_BRACKET &&        if (c == CHAR_LEFT_SQUARE_BRACKET &&
2880            (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||            (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
2881             ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))             ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
2882          {          {
2883          BOOL local_negate = FALSE;          BOOL local_negate = FALSE;
# Line 3227  for (;; ptr++) Line 3227  for (;; ptr++)
3227          while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)          while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
3228            {            {
3229            ptr += 2;            ptr += 2;
3230            if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)            if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
3231              { ptr += 2; continue; }              { ptr += 2; continue; }
3232            inescq = TRUE;            inescq = TRUE;
3233            break;            break;
# Line 4427  we set the flag only if there is a liter Line 4427  we set the flag only if there is a liter
4427            }            }
4428          namelen = ptr - name;          namelen = ptr - name;
4429    
4430          if ((terminator > 0 && *ptr++ != terminator) ||          if ((terminator > 0 && *ptr++ != terminator) ||
4431              *ptr++ != CHAR_RIGHT_PARENTHESIS)              *ptr++ != CHAR_RIGHT_PARENTHESIS)
4432            {            {
4433            ptr--;      /* Error offset */            ptr--;      /* Error offset */
# Line 4626  we set the flag only if there is a liter Line 4626  we set the flag only if there is a liter
4626    
4627          /* ------------------------------------------------------------ */          /* ------------------------------------------------------------ */
4628          case CHAR_P:              /* Python-style named subpattern handling */          case CHAR_P:              /* Python-style named subpattern handling */
4629          if (*(++ptr) == CHAR_EQUALS_SIGN ||          if (*(++ptr) == CHAR_EQUALS_SIGN ||
4630              *ptr == CHAR_GREATER_THAN_SIGN)  /* Reference or recursion */              *ptr == CHAR_GREATER_THAN_SIGN)  /* Reference or recursion */
4631            {            {
4632            is_recurse = *ptr == CHAR_GREATER_THAN_SIGN;            is_recurse = *ptr == CHAR_GREATER_THAN_SIGN;
# Line 4645  we set the flag only if there is a liter Line 4645  we set the flag only if there is a liter
4645          DEFINE_NAME:    /* Come here from (?< handling */          DEFINE_NAME:    /* Come here from (?< handling */
4646          case CHAR_APOSTROPHE:          case CHAR_APOSTROPHE:
4647            {            {
4648            terminator = (*ptr == CHAR_LESS_THAN_SIGN)?            terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
4649              CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;              CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
4650            name = ++ptr;            name = ++ptr;
4651    
# Line 5240  we set the flag only if there is a liter Line 5240  we set the flag only if there is a liter
5240        {        {
5241        if (-c == ESC_Q)            /* Handle start of quoted string */        if (-c == ESC_Q)            /* Handle start of quoted string */
5242          {          {
5243          if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)          if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
5244            ptr += 2;               /* avoid empty string */            ptr += 2;               /* avoid empty string */
5245              else inescq = TRUE;              else inescq = TRUE;
5246          continue;          continue;
# Line 5270  we set the flag only if there is a liter Line 5270  we set the flag only if there is a liter
5270          {          {
5271          const uschar *p;          const uschar *p;
5272          save_hwm = cd->hwm;   /* Normally this is set when '(' is read */          save_hwm = cd->hwm;   /* Normally this is set when '(' is read */
5273          terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?          terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
5274            CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;            CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
5275    
5276          /* These two statements stop the compiler from warning about possibly          /* These two statements stop the compiler from warning about possibly
# Line 5321  we set the flag only if there is a liter Line 5321  we set the flag only if there is a liter
5321        /* \k<name> or \k'name' is a back reference by name (Perl syntax).        /* \k<name> or \k'name' is a back reference by name (Perl syntax).
5322        We also support \k{name} (.NET syntax) */        We also support \k{name} (.NET syntax) */
5323    
5324        if (-c == ESC_k && (ptr[1] == CHAR_LESS_THAN_SIGN ||        if (-c == ESC_k && (ptr[1] == CHAR_LESS_THAN_SIGN ||
5325            ptr[1] == CHAR_APOSTROPHE || ptr[1] == CHAR_LEFT_CURLY_BRACKET))            ptr[1] == CHAR_APOSTROPHE || ptr[1] == CHAR_LEFT_CURLY_BRACKET))
5326          {          {
5327          is_recurse = FALSE;          is_recurse = FALSE;
5328          terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?          terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
5329            CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?            CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
5330            CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;            CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
5331          goto NAMED_REF_OR_RECURSE;          goto NAMED_REF_OR_RECURSE;
5332          }          }
# Line 5879  do { Line 5879  do {
5879     const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],     const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5880       NULL, 0, FALSE);       NULL, 0, FALSE);
5881     register int op = *scode;     register int op = *scode;
5882    
5883     /* If we are at the start of a conditional assertion group, *both* the     /* If we are at the start of a conditional assertion group, *both* the
5884     conditional assertion *and* what follows the condition must satisfy the test     conditional assertion *and* what follows the condition must satisfy the test
5885     for start of line. Other kinds of condition fail. Note that there may be an     for start of line. Other kinds of condition fail. Note that there may be an
# Line 5887  do { Line 5887  do {
5887    
5888     if (op == OP_COND)     if (op == OP_COND)
5889       {       {
5890       scode += 1 + LINK_SIZE;       scode += 1 + LINK_SIZE;
5891       if (*scode == OP_CALLOUT) scode += _pcre_OP_lengths[OP_CALLOUT];       if (*scode == OP_CALLOUT) scode += _pcre_OP_lengths[OP_CALLOUT];
5892       switch (*scode)       switch (*scode)
5893         {         {
5894         case OP_CREF:         case OP_CREF:
5895         case OP_RREF:         case OP_RREF:
5896         case OP_DEF:         case OP_DEF:
5897         return FALSE;         return FALSE;
5898    
5899         default:     /* Assertion */         default:     /* Assertion */
5900         if (!is_startline(scode, bracket_map, backref_map)) return FALSE;         if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
5901         do scode += GET(scode, 1); while (*scode == OP_ALT);         do scode += GET(scode, 1); while (*scode == OP_ALT);
5902         scode += 1 + LINK_SIZE;         scode += 1 + LINK_SIZE;
5903         break;         break;
5904         }         }
5905       scode = first_significant_code(scode, NULL, 0, FALSE);       scode = first_significant_code(scode, NULL, 0, FALSE);
5906       op = *scode;       op = *scode;
5907       }       }
5908    
5909     /* Non-capturing brackets */     /* Non-capturing brackets */
5910    
# Line 5925  do { Line 5925  do {
5925     /* Other brackets */     /* Other brackets */
5926    
5927     else if (op == OP_ASSERT || op == OP_ONCE)     else if (op == OP_ASSERT || op == OP_ONCE)
5928       {       {
5929       if (!is_startline(scode, bracket_map, backref_map)) return FALSE;       if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
5930       }       }
5931    
5932     /* .* means "start at start or after \n" if it isn't in brackets that     /* .* means "start at start or after \n" if it isn't in brackets that
# Line 6141  cd->ctypes = tables + ctypes_offset; Line 6141  cd->ctypes = tables + ctypes_offset;
6141  /* Check for global one-time settings at the start of the pattern, and remember  /* Check for global one-time settings at the start of the pattern, and remember
6142  the offset for later. */  the offset for later. */
6143    
6144  while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&  while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
6145         ptr[skipatstart+1] == CHAR_ASTERISK)         ptr[skipatstart+1] == CHAR_ASTERISK)
6146    {    {
6147    int newnl = 0;    int newnl = 0;

Legend:
Removed from v.391  
changed lines
  Added in v.392

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12