/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory | Revision Log | View Patch

revision 145 by ph10, Wed Apr 4 14:06:52 2007 UTC revision 195 by ph10, Mon Jul 30 13:23:28 2007 UTC
# Line 58  used by pcretest. DEBUG is not defined w Line 58  used by pcretest. DEBUG is not defined w
58  #endif  #endif
59    
60    
61    /* Macro for setting individual bits in class bitmaps: sets bit b%8 of a[b/8]. */
62    /* Neither argument is parenthesized in the expansion, so pass a plain name and a simple constant. */
63    #define SETBIT(a,b) a[b/8] |= (1 << (b%8))
64    
65    
66  /*************************************************  /*************************************************
67  *      Code parameters and static tables         *  *      Code parameters and static tables         *
68  *************************************************/  *************************************************/
# Line 87  static const short int escapes[] = { Line 92  static const short int escapes[] = {
92       0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */       0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */
93       0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */       0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */
94     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */
95       0,      0,      0,      0,      0,      0,      0,      0,   /* H - O */  -ESC_H,      0,      0, -ESC_K,      0,      0,      0,      0,   /* H - O */
96  -ESC_P, -ESC_Q, -ESC_R, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */  -ESC_P, -ESC_Q, -ESC_R, -ESC_S,      0,      0, -ESC_V, -ESC_W,   /* P - W */
97  -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */  -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */
98     '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */     '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */
99       0,      0,      0, -ESC_k,      0,      0,  ESC_n,      0,   /* h - o */  -ESC_h,      0,      0, -ESC_k,      0,      0,  ESC_n,      0,   /* h - o */
100  -ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0,      0, -ESC_w,   /* p - w */  -ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0, -ESC_v, -ESC_w,   /* p - w */
101       0,      0, -ESC_z                                            /* x - z */       0,      0, -ESC_z                                            /* x - z */
102  };  };
103    
# Line 106  static const short int escapes[] = { Line 111  static const short int escapes[] = {
111  /*  70 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  70 */     0,     0,      0,       0,      0,     0,      0,      0,
112  /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',  /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
113  /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,  /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
114  /*  88 */     0,     0,      0,     '{',      0,     0,      0,      0,  /*  88 */-ESC_h,     0,      0,     '{',      0,     0,      0,      0,
115  /*  90 */     0,     0, -ESC_k,     'l',      0, ESC_n,      0, -ESC_p,  /*  90 */     0,     0, -ESC_k,     'l',      0, ESC_n,      0, -ESC_p,
116  /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,  /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
117  /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,     0, -ESC_w,      0,  /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,-ESC_v, -ESC_w,      0,
118  /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,  /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
119  /*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,
120  /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',  /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
121  /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,  /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
122  /*  C8 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,
123  /*  D0 */   '}',     0,      0,       0,      0,     0,      0, -ESC_P,  /*  D0 */   '}',     0, -ESC_K,       0,      0,     0,      0, -ESC_P,
124  /*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,  /*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
125  /*  E0 */  '\\',     0, -ESC_S,       0,      0,     0, -ESC_W, -ESC_X,  /*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,
126  /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,  /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
127  /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
128  /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0  /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0
# Line 208  static const char *error_texts[] = { Line 213  static const char *error_texts[] = {
213    "malformed number or name after (?(",    "malformed number or name after (?(",
214    "conditional group contains more than two branches",    "conditional group contains more than two branches",
215    "assertion expected after (?(",    "assertion expected after (?(",
216    "(?R or (?digits must be followed by )",    "(?R or (?[+-]digits must be followed by )",
217    /* 30 */    /* 30 */
218    "unknown POSIX class name",    "unknown POSIX class name",
219    "POSIX collating elements are not supported",    "POSIX collating elements are not supported",
# Line 242  static const char *error_texts[] = { Line 247  static const char *error_texts[] = {
247    /* 55 */    /* 55 */
248    "repeating a DEFINE group is not allowed",    "repeating a DEFINE group is not allowed",
249    "inconsistent NEWLINE options",    "inconsistent NEWLINE options",
250    "\\g is not followed by an (optionally braced) non-zero number"    "\\g is not followed by a braced name or an optionally braced non-zero number",
251      "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number"
252  };  };
253    
254    
# Line 373  static const unsigned char ebcdic_charta Line 379  static const unsigned char ebcdic_charta
379  /* Definition to allow mutual recursion */  /* Definition to allow mutual recursion */
380    
381  static BOOL  static BOOL
382    compile_regex(int, int, uschar **, const uschar **, int *, BOOL, int, int *,    compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
383      int *, branch_chain *, compile_data *, int *);      int *, int *, branch_chain *, compile_data *, int *);
384    
385    
386    
# Line 452  else Line 458  else
458    
459      /* \g must be followed by a number, either plain or braced. If positive, it      /* \g must be followed by a number, either plain or braced. If positive, it
460      is an absolute backreference. If negative, it is a relative backreference.      is an absolute backreference. If negative, it is a relative backreference.
461      This is a Perl 5.10 feature. */      This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a
462        reference to a named group. This is part of Perl's movement towards a
463        unified syntax for back references. As this is synonymous with \k{name}, we
464        fudge it up by pretending it really was \k. */
465    
466      case 'g':      case 'g':
467      if (ptr[1] == '{')      if (ptr[1] == '{')
468        {        {
469          const uschar *p;
470          for (p = ptr+2; *p != 0 && *p != '}'; p++)
471            if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
472          if (*p != 0 && *p != '}')
473            {
474            c = -ESC_k;
475            break;
476            }
477        braced = TRUE;        braced = TRUE;
478        ptr++;        ptr++;
479        }        }
# Line 1370  for (code = first_significant_code(code Line 1387  for (code = first_significant_code(code
1387    
1388    c = *code;    c = *code;
1389    
1390      /* Groups with zero repeats can of course be empty; skip them. */
1391    
1392      if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1393        {
1394        code += _pcre_OP_lengths[c];
1395        do code += GET(code, 1); while (*code == OP_ALT);
1396        c = *code;
1397        continue;
1398        }
1399    
1400      /* For other groups, scan the branches. */
1401    
1402    if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE)    if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE)
1403      {      {
1404      BOOL empty_branch;      BOOL empty_branch;
# Line 1386  for (code = first_significant_code(code Line 1415  for (code = first_significant_code(code
1415        }        }
1416      while (*code == OP_ALT);      while (*code == OP_ALT);
1417      if (!empty_branch) return FALSE;   /* All branches are non-empty */      if (!empty_branch) return FALSE;   /* All branches are non-empty */
1418        c = *code;
     /* Move past the KET and fudge things so that the increment in the "for"  
     above has no effect. */  
   
     c = OP_END;  
     code += 1 + LINK_SIZE - _pcre_OP_lengths[c];  
1419      continue;      continue;
1420      }      }
1421    
# Line 1925  if (next >= 0) switch(op_code) Line 1949  if (next >= 0) switch(op_code)
1949    case OP_NOT_WORDCHAR:    case OP_NOT_WORDCHAR:
1950    return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;    return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
1951    
1952      case OP_HSPACE:
1953      case OP_NOT_HSPACE:
1954      switch(next)
1955        {
1956        case 0x09:
1957        case 0x20:
1958        case 0xa0:
1959        case 0x1680:
1960        case 0x180e:
1961        case 0x2000:
1962        case 0x2001:
1963        case 0x2002:
1964        case 0x2003:
1965        case 0x2004:
1966        case 0x2005:
1967        case 0x2006:
1968        case 0x2007:
1969        case 0x2008:
1970        case 0x2009:
1971        case 0x200A:
1972        case 0x202f:
1973        case 0x205f:
1974        case 0x3000:
1975        return op_code != OP_HSPACE;
1976        default:
1977        return op_code == OP_HSPACE;
1978        }
1979    
1980      case OP_VSPACE:
1981      case OP_NOT_VSPACE:
1982      switch(next)
1983        {
1984        case 0x0a:
1985        case 0x0b:
1986        case 0x0c:
1987        case 0x0d:
1988        case 0x85:
1989        case 0x2028:
1990        case 0x2029:
1991        return op_code != OP_VSPACE;
1992        default:
1993        return op_code == OP_VSPACE;
1994        }
1995    
1996    default:    default:
1997    return FALSE;    return FALSE;
1998    }    }
# Line 1959  switch(op_code) Line 2027  switch(op_code)
2027      case ESC_W:      case ESC_W:
2028      return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;      return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2029    
2030        case ESC_h:
2031        case ESC_H:
2032        switch(item)
2033          {
2034          case 0x09:
2035          case 0x20:
2036          case 0xa0:
2037          case 0x1680:
2038          case 0x180e:
2039          case 0x2000:
2040          case 0x2001:
2041          case 0x2002:
2042          case 0x2003:
2043          case 0x2004:
2044          case 0x2005:
2045          case 0x2006:
2046          case 0x2007:
2047          case 0x2008:
2048          case 0x2009:
2049          case 0x200A:
2050          case 0x202f:
2051          case 0x205f:
2052          case 0x3000:
2053          return -next != ESC_h;
2054          default:
2055          return -next == ESC_h;
2056          }
2057    
2058        case ESC_v:
2059        case ESC_V:
2060        switch(item)
2061          {
2062          case 0x0a:
2063          case 0x0b:
2064          case 0x0c:
2065          case 0x0d:
2066          case 0x85:
2067          case 0x2028:
2068          case 0x2029:
2069          return -next != ESC_v;
2070          default:
2071          return -next == ESC_v;
2072          }
2073    
2074      default:      default:
2075      return FALSE;      return FALSE;
2076      }      }
2077    
2078    case OP_DIGIT:    case OP_DIGIT:
2079    return next == -ESC_D || next == -ESC_s || next == -ESC_W;    return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2080             next == -ESC_h || next == -ESC_v;
2081    
2082    case OP_NOT_DIGIT:    case OP_NOT_DIGIT:
2083    return next == -ESC_d;    return next == -ESC_d;
# Line 1973  switch(op_code) Line 2086  switch(op_code)
2086    return next == -ESC_S || next == -ESC_d || next == -ESC_w;    return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2087    
2088    case OP_NOT_WHITESPACE:    case OP_NOT_WHITESPACE:
2089    return next == -ESC_s;    return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2090    
2091      case OP_HSPACE:
2092      return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2093    
2094      case OP_NOT_HSPACE:
2095      return next == -ESC_h;
2096    
2097      /* Can't have \S in here because VT matches \S (Perl anomaly) */
2098      case OP_VSPACE:
2099      return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2100    
2101      case OP_NOT_VSPACE:
2102      return next == -ESC_v;
2103    
2104    case OP_WORDCHAR:    case OP_WORDCHAR:
2105    return next == -ESC_W || next == -ESC_s;    return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2106    
2107    case OP_NOT_WORDCHAR:    case OP_NOT_WORDCHAR:
2108    return next == -ESC_w || next == -ESC_d;    return next == -ESC_w || next == -ESC_d;
# Line 2091  for (;; ptr++) Line 2217  for (;; ptr++)
2217    BOOL possessive_quantifier;    BOOL possessive_quantifier;
2218    BOOL is_quantifier;    BOOL is_quantifier;
2219    BOOL is_recurse;    BOOL is_recurse;
2220      BOOL reset_bracount;
2221    int class_charcount;    int class_charcount;
2222    int class_lastchar;    int class_lastchar;
2223    int newoptions;    int newoptions;
2224    int recno;    int recno;
2225      int refsign;
2226    int skipbytes;    int skipbytes;
2227    int subreqbyte;    int subreqbyte;
2228    int subfirstbyte;    int subfirstbyte;
# Line 2519  for (;; ptr++) Line 2647  for (;; ptr++)
2647            else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||            else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2648                     c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;                     c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2649    
2650              /* We need to deal with \H, \h, \V, and \v in both phases because
2651              they use extra memory. */
2652    
2653              if (-c == ESC_h)   /* \h in a class: add the horizontal white space characters */
2654                {
2655                SETBIT(classbits, 0x09); /* HT */
2656                SETBIT(classbits, 0x20); /* SPACE */
2657                SETBIT(classbits, 0xa0); /* NBSP */
2658    #ifdef SUPPORT_UTF8
2659                if (utf8)   /* the remaining code points do not fit in the 32-byte bitmap */
2660                  {
2661                  class_utf8 = TRUE;
2662                  *class_utf8data++ = XCL_SINGLE;   /* each XCL_SINGLE is followed by one code point */
2663                  class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2664                  *class_utf8data++ = XCL_SINGLE;
2665                  class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2666                  *class_utf8data++ = XCL_RANGE;    /* XCL_RANGE is followed by first and last code point */
2667                  class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2668                  class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2669                  *class_utf8data++ = XCL_SINGLE;
2670                  class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2671                  *class_utf8data++ = XCL_SINGLE;
2672                  class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2673                  *class_utf8data++ = XCL_SINGLE;
2674                  class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
2675                  }
2676    #endif
2677                continue;
2678                }
2679    
2680              if (-c == ESC_H)
2681                {
2682                for (c = 0; c < 32; c++)
2683                  {
2684                  int x = 0xff;
2685                  switch (c)
2686                    {
2687                    case 0x09/8: x ^= 1 << (0x09%8); break;
2688                    case 0x20/8: x ^= 1 << (0x20%8); break;
2689                    case 0xa0/8: x ^= 1 << (0xa0%8); break;
2690                    default: break;
2691                    }
2692                  classbits[c] |= x;
2693                  }
2694    
2695    #ifdef SUPPORT_UTF8
2696                if (utf8)
2697                  {
2698                  class_utf8 = TRUE;
2699                  *class_utf8data++ = XCL_RANGE;
2700                  class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2701                  class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
2702                  *class_utf8data++ = XCL_RANGE;
2703                  class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
2704                  class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
2705                  *class_utf8data++ = XCL_RANGE;
2706                  class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
2707                  class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
2708                  *class_utf8data++ = XCL_RANGE;
2709                  class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
2710                  class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
2711                  *class_utf8data++ = XCL_RANGE;
2712                  class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
2713                  class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
2714                  *class_utf8data++ = XCL_RANGE;
2715                  class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
2716                  class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
2717                  *class_utf8data++ = XCL_RANGE;
2718                  class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
2719                  class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2720                  }
2721    #endif
2722                continue;
2723                }
2724    
2725              if (-c == ESC_v)
2726                {
2727                SETBIT(classbits, 0x0a); /* LF */
2728                SETBIT(classbits, 0x0b); /* VT */
2729                SETBIT(classbits, 0x0c); /* FF */
2730                SETBIT(classbits, 0x0d); /* CR */
2731                SETBIT(classbits, 0x85); /* NEL */
2732    #ifdef SUPPORT_UTF8
2733                if (utf8)
2734                  {
2735                  class_utf8 = TRUE;
2736                  *class_utf8data++ = XCL_RANGE;
2737                  class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
2738                  class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2739                  }
2740    #endif
2741                continue;
2742                }
2743    
2744              if (-c == ESC_V)
2745                {
2746                for (c = 0; c < 32; c++)
2747                  {
2748                  int x = 0xff;
2749                  switch (c)
2750                    {
2751                    case 0x0a/8: x ^= 1 << (0x0a%8);
2752                                 x ^= 1 << (0x0b%8);
2753                                 x ^= 1 << (0x0c%8);
2754                                 x ^= 1 << (0x0d%8);
2755                                 break;
2756                    case 0x85/8: x ^= 1 << (0x85%8); break;
2757                    default: break;
2758                    }
2759                  classbits[c] |= x;
2760                  }
2761    
2762    #ifdef SUPPORT_UTF8
2763                if (utf8)
2764                  {
2765                  class_utf8 = TRUE;
2766                  *class_utf8data++ = XCL_RANGE;
2767                  class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2768                  class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
2769                  *class_utf8data++ = XCL_RANGE;
2770                  class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2771                  class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2772                  }
2773    #endif
2774                continue;
2775                }
2776    
2777            /* We need to deal with \P and \p in both phases. */            /* We need to deal with \P and \p in both phases. */
2778    
2779  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
# Line 2659  for (;; ptr++) Line 2914  for (;; ptr++)
2914              unsigned int origd = d;              unsigned int origd = d;
2915              while (get_othercase_range(&cc, origd, &occ, &ocd))              while (get_othercase_range(&cc, origd, &occ, &ocd))
2916                {                {
2917                if (occ >= c && ocd <= d) continue;  /* Skip embedded ranges */                if (occ >= (unsigned int)c &&
2918                      ocd <= (unsigned int)d)
2919                    continue;                          /* Skip embedded ranges */
2920    
2921                if (occ < c  && ocd >= c - 1)        /* Extend the basic range */                if (occ < (unsigned int)c  &&
2922                      ocd >= (unsigned int)c - 1)      /* Extend the basic range */
2923                  {                                  /* if there is overlap,   */                  {                                  /* if there is overlap,   */
2924                  c = occ;                           /* noting that if occ < c */                  c = occ;                           /* noting that if occ < c */
2925                  continue;                          /* we can't have ocd > d  */                  continue;                          /* we can't have ocd > d  */
2926                  }                                  /* because a subrange is  */                  }                                  /* because a subrange is  */
2927                if (ocd > d && occ <= d + 1)         /* always shorter than    */                if (ocd > (unsigned int)d &&
2928                      occ <= (unsigned int)d + 1)      /* always shorter than    */
2929                  {                                  /* the basic range.       */                  {                                  /* the basic range.       */
2930                  d = ocd;                  d = ocd;
2931                  continue;                  continue;
# Line 3564  for (;; ptr++) Line 3823  for (;; ptr++)
3823      skipbytes = 0;      skipbytes = 0;
3824      bravalue = OP_CBRA;      bravalue = OP_CBRA;
3825      save_hwm = cd->hwm;      save_hwm = cd->hwm;
3826        reset_bracount = FALSE;
3827    
3828      if (*(++ptr) == '?')      if (*(++ptr) == '?')
3829        {        {
# Line 3586  for (;; ptr++) Line 3846  for (;; ptr++)
3846    
3847    
3848          /* ------------------------------------------------------------ */          /* ------------------------------------------------------------ */
3849            case '|':                 /* Reset capture count for each branch */
3850            reset_bracount = TRUE;
3851            /* Fall through */
3852    
3853            /* ------------------------------------------------------------ */
3854          case ':':                 /* Non-capturing bracket */          case ':':                 /* Non-capturing bracket */
3855          bravalue = OP_BRA;          bravalue = OP_BRA;
3856          ptr++;          ptr++;
# Line 3621  for (;; ptr++) Line 3886  for (;; ptr++)
3886    
3887          code[1+LINK_SIZE] = OP_CREF;          code[1+LINK_SIZE] = OP_CREF;
3888          skipbytes = 3;          skipbytes = 3;
3889            refsign = -1;
3890    
3891          /* Check for a test for recursion in a named group. */          /* Check for a test for recursion in a named group. */
3892    
# Line 3644  for (;; ptr++) Line 3910  for (;; ptr++)
3910            terminator = '\'';            terminator = '\'';
3911            ptr++;            ptr++;
3912            }            }
3913          else terminator = 0;          else
3914              {
3915              terminator = 0;
3916              if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
3917              }
3918    
3919          /* We now expect to read a name; any thing else is an error */          /* We now expect to read a name; any thing else is an error */
3920    
# Line 3680  for (;; ptr++) Line 3950  for (;; ptr++)
3950          if (lengthptr != NULL) break;          if (lengthptr != NULL) break;
3951    
3952          /* In the real compile we do the work of looking for the actual          /* In the real compile we do the work of looking for the actual
3953          reference. */          reference. If the string started with "+" or "-" we require the rest to
3954            be digits, in which case recno will be set. */
3955    
3956            if (refsign > 0)
3957              {
3958              if (recno <= 0)
3959                {
3960                *errorcodeptr = ERR58;
3961                goto FAILED;
3962                }
3963              if (refsign == '-')
3964                {
3965                recno = cd->bracount - recno + 1;
3966                if (recno <= 0)
3967                  {
3968                  *errorcodeptr = ERR15;
3969                  goto FAILED;
3970                  }
3971                }
3972              else recno += cd->bracount;
3973              PUT2(code, 2+LINK_SIZE, recno);
3974              break;
3975              }
3976    
3977            /* Otherwise (did not start with "+" or "-"), start by looking for the
3978            name. */
3979    
3980          slot = cd->name_table;          slot = cd->name_table;
3981          for (i = 0; i < cd->names_found; i++)          for (i = 0; i < cd->names_found; i++)
# Line 3999  for (;; ptr++) Line 4294  for (;; ptr++)
4294    
4295    
4296          /* ------------------------------------------------------------ */          /* ------------------------------------------------------------ */
4297            case '-': case '+':
4298          case '0': case '1': case '2': case '3': case '4':   /* Recursion or */          case '0': case '1': case '2': case '3': case '4':   /* Recursion or */
4299          case '5': case '6': case '7': case '8': case '9':   /* subroutine */          case '5': case '6': case '7': case '8': case '9':   /* subroutine */
4300            {            {
4301            const uschar *called;            const uschar *called;
4302    
4303              if ((refsign = *ptr) == '+') ptr++;
4304              else if (refsign == '-')
4305                {
4306                if ((digitab[ptr[1]] & ctype_digit) == 0)
4307                  goto OTHER_CHAR_AFTER_QUERY;
4308                ptr++;
4309                }
4310    
4311            recno = 0;            recno = 0;
4312            while((digitab[*ptr] & ctype_digit) != 0)            while((digitab[*ptr] & ctype_digit) != 0)
4313              recno = recno * 10 + *ptr++ - '0';              recno = recno * 10 + *ptr++ - '0';
4314    
4315            if (*ptr != ')')            if (*ptr != ')')
4316              {              {
4317              *errorcodeptr = ERR29;              *errorcodeptr = ERR29;
4318              goto FAILED;              goto FAILED;
4319              }              }
4320    
4321              if (refsign == '-')
4322                {
4323                if (recno == 0)
4324                  {
4325                  *errorcodeptr = ERR58;
4326                  goto FAILED;
4327                  }
4328                recno = cd->bracount - recno + 1;
4329                if (recno <= 0)
4330                  {
4331                  *errorcodeptr = ERR15;
4332                  goto FAILED;
4333                  }
4334                }
4335              else if (refsign == '+')
4336                {
4337                if (recno == 0)
4338                  {
4339                  *errorcodeptr = ERR58;
4340                  goto FAILED;
4341                  }
4342                recno += cd->bracount;
4343                }
4344    
4345            /* Come here from code above that handles a named recursion */            /* Come here from code above that handles a named recursion */
4346    
4347            HANDLE_RECURSION:            HANDLE_RECURSION:
# Line 4084  for (;; ptr++) Line 4414  for (;; ptr++)
4414    
4415          /* ------------------------------------------------------------ */          /* ------------------------------------------------------------ */
4416          default:              /* Other characters: check option setting */          default:              /* Other characters: check option setting */
4417            OTHER_CHAR_AFTER_QUERY:
4418          set = unset = 0;          set = unset = 0;
4419          optset = &set;          optset = &set;
4420    
# Line 4218  for (;; ptr++) Line 4549  for (;; ptr++)
4549           errorcodeptr,                 /* Where to put an error message */           errorcodeptr,                 /* Where to put an error message */
4550           (bravalue == OP_ASSERTBACK ||           (bravalue == OP_ASSERTBACK ||
4551            bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */            bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
4552             reset_bracount,               /* True if (?| group */
4553           skipbytes,                    /* Skip over bracket number */           skipbytes,                    /* Skip over bracket number */
4554           &subfirstbyte,                /* For possible first char */           &subfirstbyte,                /* For possible first char */
4555           &subreqbyte,                  /* For possible last char */           &subreqbyte,                  /* For possible last char */
# Line 4234  for (;; ptr++) Line 4566  for (;; ptr++)
4566      is on the bracket. */      is on the bracket. */
4567    
4568      /* If this is a conditional bracket, check that there are no more than      /* If this is a conditional bracket, check that there are no more than
4569      two branches in the group, or just one if it's a DEFINE group. */      two branches in the group, or just one if it's a DEFINE group. We do this
4570        in the real compile phase, not in the pre-pass, where the whole group may
4571        not be available. */
4572    
4573      if (bravalue == OP_COND)      if (bravalue == OP_COND && lengthptr == NULL)
4574        {        {
4575        uschar *tc = code;        uschar *tc = code;
4576        int condcount = 0;        int condcount = 0;
# Line 4396  for (;; ptr++) Line 4730  for (;; ptr++)
4730        zerofirstbyte = firstbyte;        zerofirstbyte = firstbyte;
4731        zeroreqbyte = reqbyte;        zeroreqbyte = reqbyte;
4732    
4733        /* \k<name> or \k'name' is a back reference by name (Perl syntax) */        /* \k<name> or \k'name' is a back reference by name (Perl syntax).
4734          We also support \k{name} (.NET syntax) */
4735    
4736        if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\''))        if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\'' || ptr[1] == '{'))
4737          {          {
4738          is_recurse = FALSE;          is_recurse = FALSE;
4739          terminator = (*(++ptr) == '<')? '>' : '\'';          terminator = (*(++ptr) == '<')? '>' : (*ptr == '\'')? '\'' : '}';
4740          goto NAMED_REF_OR_RECURSE;          goto NAMED_REF_OR_RECURSE;
4741          }          }
4742    
# Line 4567  This function is used during the pre-com Line 4902  This function is used during the pre-com
4902  out the amount of memory needed, as well as during the real compile phase. The  out the amount of memory needed, as well as during the real compile phase. The
4903  value of lengthptr distinguishes the two phases.  value of lengthptr distinguishes the two phases.
4904    
4905  Argument:  Arguments:
4906    options        option bits, including any changes for this subpattern    options        option bits, including any changes for this subpattern
4907    oldims         previous settings of ims option bits    oldims         previous settings of ims option bits
4908    codeptr        -> the address of the current code pointer    codeptr        -> the address of the current code pointer
4909    ptrptr         -> the address of the current pattern pointer    ptrptr         -> the address of the current pattern pointer
4910    errorcodeptr   -> pointer to error code variable    errorcodeptr   -> pointer to error code variable
4911    lookbehind     TRUE if this is a lookbehind assertion    lookbehind     TRUE if this is a lookbehind assertion
4912      reset_bracount TRUE to reset the count for each branch
4913    skipbytes      skip this many bytes at start (for brackets and OP_COND)    skipbytes      skip this many bytes at start (for brackets and OP_COND)
4914    firstbyteptr   place to put the first required character, or a negative number    firstbyteptr   place to put the first required character, or a negative number
4915    reqbyteptr     place to put the last required character, or a negative number    reqbyteptr     place to put the last required character, or a negative number
# Line 4587  Returns: TRUE on success Line 4923  Returns: TRUE on success
4923    
4924  static BOOL  static BOOL
4925  compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,  compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
4926    int *errorcodeptr, BOOL lookbehind, int skipbytes, int *firstbyteptr,    int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
4927    int *reqbyteptr, branch_chain *bcptr, compile_data *cd, int *lengthptr)    int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd,
4928      int *lengthptr)
4929  {  {
4930  const uschar *ptr = *ptrptr;  const uschar *ptr = *ptrptr;
4931  uschar *code = *codeptr;  uschar *code = *codeptr;
# Line 4598  uschar *reverse_count = NULL; Line 4935  uschar *reverse_count = NULL;
4935  int firstbyte, reqbyte;  int firstbyte, reqbyte;
4936  int branchfirstbyte, branchreqbyte;  int branchfirstbyte, branchreqbyte;
4937  int length;  int length;
4938    int orig_bracount;
4939    int max_bracount;
4940  branch_chain bc;  branch_chain bc;
4941    
4942  bc.outer = bcptr;  bc.outer = bcptr;
# Line 4626  code += 1 + LINK_SIZE + skipbytes; Line 4965  code += 1 + LINK_SIZE + skipbytes;
4965    
4966  /* Loop for each alternative branch */  /* Loop for each alternative branch */
4967    
4968    orig_bracount = max_bracount = cd->bracount;
4969  for (;;)  for (;;)
4970    {    {
4971      /* For a (?| group, reset the capturing bracket count so that each branch
4972      uses the same numbers. */
4973    
4974      if (reset_bracount) cd->bracount = orig_bracount;
4975    
4976    /* Handle a change of ims options at the start of the branch */    /* Handle a change of ims options at the start of the branch */
4977    
4978    if ((options & PCRE_IMS) != oldims)    if ((options & PCRE_IMS) != oldims)
# Line 4657  for (;;) Line 5002  for (;;)
5002      return FALSE;      return FALSE;
5003      }      }
5004    
5005      /* Keep the highest bracket count in case (?| was used and some branch
5006      has fewer than the rest. */
5007    
5008      if (cd->bracount > max_bracount) max_bracount = cd->bracount;
5009    
5010    /* In the real compile phase, there is some post-processing to be done. */    /* In the real compile phase, there is some post-processing to be done. */
5011    
5012    if (lengthptr == NULL)    if (lengthptr == NULL)
# Line 4720  for (;;) Line 5070  for (;;)
5070        }        }
5071      }      }
5072    
5073    /* Reached end of expression, either ')' or end of pattern. Go back through    /* Reached end of expression, either ')' or end of pattern. In the real
5074    the alternative branches and reverse the chain of offsets, with the field in    compile phase, go back through the alternative branches and reverse the chain
5075    the BRA item now becoming an offset to the first alternative. If there are    of offsets, with the field in the BRA item now becoming an offset to the
5076    no alternatives, it points to the end of the group. The length in the    first alternative. If there are no alternatives, it points to the end of the
5077    terminating ket is always the length of the whole bracketed item. If any of    group. The length in the terminating ket is always the length of the whole
5078    the ims options were changed inside the group, compile a resetting op-code    bracketed item. If any of the ims options were changed inside the group,
5079    following, except at the very end of the pattern. Return leaving the pointer    compile a resetting op-code following, except at the very end of the pattern.
5080    at the terminating char. */    Return leaving the pointer at the terminating char. */
5081    
5082    if (*ptr != '|')    if (*ptr != '|')
5083      {      {
5084      int branch_length = code - last_branch;      if (lengthptr == NULL)
     do  
5085        {        {
5086        int prev_length = GET(last_branch, 1);        int branch_length = code - last_branch;
5087        PUT(last_branch, 1, branch_length);        do
5088        branch_length = prev_length;          {
5089        last_branch -= branch_length;          int prev_length = GET(last_branch, 1);
5090            PUT(last_branch, 1, branch_length);
5091            branch_length = prev_length;
5092            last_branch -= branch_length;
5093            }
5094          while (branch_length > 0);
5095        }        }
     while (branch_length > 0);  
5096    
5097      /* Fill in the ket */      /* Fill in the ket */
5098    
# Line 4756  for (;;) Line 5109  for (;;)
5109        length += 2;        length += 2;
5110        }        }
5111    
5112        /* Retain the highest bracket number, in case resetting was used. */
5113    
5114        cd->bracount = max_bracount;
5115    
5116      /* Set values to pass back */      /* Set values to pass back */
5117    
5118      *codeptr = code;      *codeptr = code;
# Line 4766  for (;;) Line 5123  for (;;)
5123      return TRUE;      return TRUE;
5124      }      }
5125    
5126    /* Another branch follows; insert an "or" node. Its length field points back    /* Another branch follows. In the pre-compile phase, we can move the code
5127      pointer back to where it was for the start of the first branch. (That is,
5128      pretend that each branch is the only one.)
5129    
5130      In the real compile phase, insert an ALT node. Its length field points back
5131    to the previous branch while the bracket remains open. At the end the chain    to the previous branch while the bracket remains open. At the end the chain
5132    is reversed. It's done like this so that the start of the bracket has a    is reversed. It's done like this so that the start of the bracket has a
5133    zero offset until it is closed, making it possible to detect recursion. */    zero offset until it is closed, making it possible to detect recursion. */
5134    
5135    *code = OP_ALT;    if (lengthptr != NULL)
5136    PUT(code, 1, code - last_branch);      {
5137    bc.current = last_branch = code;      code = *codeptr + 1 + LINK_SIZE + skipbytes;
5138    code += 1 + LINK_SIZE;      length += 1 + LINK_SIZE;
5139        }
5140      else
5141        {
5142        *code = OP_ALT;
5143        PUT(code, 1, code - last_branch);
5144        bc.current = last_branch = code;
5145        code += 1 + LINK_SIZE;
5146        }
5147    
5148    ptr++;    ptr++;
   length += 1 + LINK_SIZE;  
5149    }    }
5150  /* Control never reaches here */  /* Control never reaches here */
5151  }  }
# Line 5138  cd->cbits = tables + cbits_offset; Line 5507  cd->cbits = tables + cbits_offset;
5507  cd->ctypes = tables + ctypes_offset;  cd->ctypes = tables + ctypes_offset;
5508    
5509  /* Handle different types of newline. The three bits give seven cases. The  /* Handle different types of newline. The three bits give seven cases. The
5510  current code allows for fixed one- or two-byte sequences, plus "any". */  current code allows for fixed one- or two-byte sequences, plus "any" and
5511    "anycrlf". */
5512    
5513  switch (options & (PCRE_NEWLINE_CRLF | PCRE_NEWLINE_ANY))  switch (options & (PCRE_NEWLINE_CRLF | PCRE_NEWLINE_ANY))
5514    {    {
# Line 5148  switch (options & (PCRE_NEWLINE_CRLF | P Line 5518  switch (options & (PCRE_NEWLINE_CRLF | P
5518    case PCRE_NEWLINE_CR+    case PCRE_NEWLINE_CR+
5519         PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;         PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
5520    case PCRE_NEWLINE_ANY: newline = -1; break;    case PCRE_NEWLINE_ANY: newline = -1; break;
5521      case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
5522    default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;    default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
5523    }    }
5524    
5525  if (newline < 0)  if (newline == -2)
5526      {
5527      cd->nltype = NLTYPE_ANYCRLF;
5528      }
5529    else if (newline < 0)
5530    {    {
5531    cd->nltype = NLTYPE_ANY;    cd->nltype = NLTYPE_ANY;
5532    }    }
# Line 5212  outside can help speed up starting point Line 5587  outside can help speed up starting point
5587  code = cworkspace;  code = cworkspace;
5588  *code = OP_BRA;  *code = OP_BRA;
5589  (void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,  (void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,
5590    &code, &ptr, &errorcode, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, &length);    &code, &ptr, &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd,
5591      &length);
5592  if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;  if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
5593    
5594  DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,  DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
# Line 5280  ptr = (const uschar *)pattern; Line 5656  ptr = (const uschar *)pattern;
5656  code = (uschar *)codestart;  code = (uschar *)codestart;
5657  *code = OP_BRA;  *code = OP_BRA;
5658  (void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,  (void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,
5659    &errorcode, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);    &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);
5660  re->top_bracket = cd->bracount;  re->top_bracket = cd->bracount;
5661  re->top_backref = cd->top_backref;  re->top_backref = cd->top_backref;
5662    

Legend:
Removed from v.145  
changed lines
  Added in v.195

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12