/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 87 by nigel, Sat Feb 24 21:41:21 2007 UTC revision 202 by ph10, Fri Aug 3 09:44:26 2007 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2006 University of Cambridge             Copyright (c) 1997-2007 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 42  POSSIBILITY OF SUCH DAMAGE. Line 42  POSSIBILITY OF SUCH DAMAGE.
42  supporting internal functions that are not used by other modules. */  supporting internal functions that are not used by other modules. */
43    
44    
45    #ifdef HAVE_CONFIG_H
46    #include <config.h>
47    #endif
48    
49    #define NLBLOCK cd             /* Block containing newline information */
50    #define PSSTART start_pattern  /* Field containing processed string start */
51    #define PSEND   end_pattern    /* Field containing processed string end */
52    
53  #include "pcre_internal.h"  #include "pcre_internal.h"
54    
55    
# Line 53  used by pcretest. DEBUG is not defined w Line 61  used by pcretest. DEBUG is not defined w
61  #endif  #endif
62    
63    
/* Macro for setting individual bits in class bitmaps. "a" is the bitmap (an
array of bytes) and "b" is the bit number. Both parameters and the whole
expansion are parenthesized so that expression arguments, for example
SETBIT(map, c + 8), select the intended byte and bit. Note that "b" is
evaluated twice, so it must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
67    
68    /* Maximum length value to check against when making sure that the integer that
69    holds the compiled pattern length does not overflow. We make it a bit less than
70    INT_MAX to allow for adding in group terminating bytes, so that we don't have
71    to check them every time. */
72    
73    #define OFLOW_MAX (INT_MAX - 20)
74    
75    
76  /*************************************************  /*************************************************
77  *      Code parameters and static tables         *  *      Code parameters and static tables         *
78  *************************************************/  *************************************************/
79    
80  /* Maximum number of items on the nested bracket stacks at compile time. This  /* This value specifies the size of stack workspace that is used during the
81  applies to the nesting of all kinds of parentheses. It does not limit  first pre-compile phase that determines how much memory is required. The regex
82  un-nested, non-capturing parentheses. This number can be made bigger if  is partly compiled into this space, but the compiled parts are discarded as
83  necessary - it is used to dimension one int and one unsigned char vector at  soon as they can be, so that hopefully there will never be an overrun. The code
84  compile time. */  does, however, check for an overrun. The largest amount I've seen used is 218,
85    so this number is very generous.
86    
87    The same workspace is used during the second, actual compile phase for
88    remembering forward references to groups so that they can be filled in at the
89    end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
90    is 4 there is plenty of room. */
91    
92  #define BRASTACK_SIZE 200  #define COMPILE_WORK_SIZE (4096)
93    
94    
95  /* Table for handling escaped characters in the range '0'-'z'. Positive returns  /* Table for handling escaped characters in the range '0'-'z'. Positive returns
# Line 72  are simple data values; negative values Line 97  are simple data values; negative values
97  on. Zero means further processing is needed (for things like \x), or the escape  on. Zero means further processing is needed (for things like \x), or the escape
98  is invalid. */  is invalid. */
99    
100  #if !EBCDIC   /* This is the "normal" table for ASCII systems */  #ifndef EBCDIC  /* This is the "normal" table for ASCII systems */
101  static const short int escapes[] = {  static const short int escapes[] = {
102       0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */       0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */
103       0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */       0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */
104     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */
105       0,      0,      0,      0,      0,      0,      0,      0,   /* H - O */  -ESC_H,      0,      0, -ESC_K,      0,      0,      0,      0,   /* H - O */
106  -ESC_P, -ESC_Q,      0, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */  -ESC_P, -ESC_Q, -ESC_R, -ESC_S,      0,      0, -ESC_V, -ESC_W,   /* P - W */
107  -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */  -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */
108     '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */     '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */
109       0,      0,      0,      0,      0,      0,  ESC_n,      0,   /* h - o */  -ESC_h,      0,      0, -ESC_k,      0,      0,  ESC_n,      0,   /* h - o */
110  -ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0,      0, -ESC_w,   /* p - w */  -ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0, -ESC_v, -ESC_w,   /* p - w */
111       0,      0, -ESC_z                                            /* x - z */       0,      0, -ESC_z                                            /* x - z */
112  };  };
113    
114  #else         /* This is the "abnormal" table for EBCDIC systems */  #else           /* This is the "abnormal" table for EBCDIC systems */
115  static const short int escapes[] = {  static const short int escapes[] = {
116  /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',  /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
117  /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,  /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
# Line 96  static const short int escapes[] = { Line 121  static const short int escapes[] = {
121  /*  70 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  70 */     0,     0,      0,       0,      0,     0,      0,      0,
122  /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',  /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
123  /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,  /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
124  /*  88 */     0,     0,      0,     '{',      0,     0,      0,      0,  /*  88 */-ESC_h,     0,      0,     '{',      0,     0,      0,      0,
125  /*  90 */     0,     0,      0,     'l',      0, ESC_n,      0, -ESC_p,  /*  90 */     0,     0, -ESC_k,     'l',      0, ESC_n,      0, -ESC_p,
126  /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,  /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
127  /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,     0, -ESC_w,      0,  /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,-ESC_v, -ESC_w,      0,
128  /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,  /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
129  /*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,
130  /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',  /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
131  /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,  /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
132  /*  C8 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,
133  /*  D0 */   '}',     0,      0,       0,      0,     0,      0, -ESC_P,  /*  D0 */   '}',     0, -ESC_K,       0,      0,     0,      0, -ESC_P,
134  /*  D8 */-ESC_Q,     0,      0,       0,      0,     0,      0,      0,  /*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
135  /*  E0 */  '\\',     0, -ESC_S,       0,      0,     0, -ESC_W, -ESC_X,  /*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,
136  /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,  /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
137  /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
138  /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0  /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0
# Line 155  static const int posix_class_maps[] = { Line 180  static const int posix_class_maps[] = {
180  };  };
181    
182    
183    #define STRING(a)  # a
184    #define XSTRING(s) STRING(s)
185    
186  /* The texts of compile-time error messages. These are "char *" because they  /* The texts of compile-time error messages. These are "char *" because they
187  are passed to the outside world. */  are passed to the outside world. Do not ever re-use any error number, because
188    they are documented. Always add a new error instead. Messages marked DEAD below
189    are no longer used. */
190    
191  static const char *error_texts[] = {  static const char *error_texts[] = {
192    "no error",    "no error",
# Line 171  static const char *error_texts[] = { Line 201  static const char *error_texts[] = {
201    "range out of order in character class",    "range out of order in character class",
202    "nothing to repeat",    "nothing to repeat",
203    /* 10 */    /* 10 */
204    "operand of unlimited repeat could match the empty string",    "operand of unlimited repeat could match the empty string",  /** DEAD **/
205    "internal error: unexpected repeat",    "internal error: unexpected repeat",
206    "unrecognized character after (?",    "unrecognized character after (?",
207    "POSIX named classes are supported only within a class",    "POSIX named classes are supported only within a class",
# Line 181  static const char *error_texts[] = { Line 211  static const char *error_texts[] = {
211    "erroffset passed as NULL",    "erroffset passed as NULL",
212    "unknown option bit(s) set",    "unknown option bit(s) set",
213    "missing ) after comment",    "missing ) after comment",
214    "parentheses nested too deeply",    "parentheses nested too deeply",  /** DEAD **/
215    /* 20 */    /* 20 */
216    "regular expression too large",    "regular expression is too large",
217    "failed to get memory",    "failed to get memory",
218    "unmatched parentheses",    "unmatched parentheses",
219    "internal error: code overflow",    "internal error: code overflow",
220    "unrecognized character after (?<",    "unrecognized character after (?<",
221    /* 25 */    /* 25 */
222    "lookbehind assertion is not fixed length",    "lookbehind assertion is not fixed length",
223    "malformed number after (?(",    "malformed number or name after (?(",
224    "conditional group contains more than two branches",    "conditional group contains more than two branches",
225    "assertion expected after (?(",    "assertion expected after (?(",
226    "(?R or (?digits must be followed by )",    "(?R or (?[+-]digits must be followed by )",
227    /* 30 */    /* 30 */
228    "unknown POSIX class name",    "unknown POSIX class name",
229    "POSIX collating elements are not supported",    "POSIX collating elements are not supported",
230    "this version of PCRE is not compiled with PCRE_UTF8 support",    "this version of PCRE is not compiled with PCRE_UTF8 support",
231    "spare error",    "spare error",  /** DEAD **/
232    "character value in \\x{...} sequence is too large",    "character value in \\x{...} sequence is too large",
233    /* 35 */    /* 35 */
234    "invalid condition (?(0)",    "invalid condition (?(0)",
# Line 209  static const char *error_texts[] = { Line 239  static const char *error_texts[] = {
239    /* 40 */    /* 40 */
240    "recursive call could loop indefinitely",    "recursive call could loop indefinitely",
241    "unrecognized character after (?P",    "unrecognized character after (?P",
242    "syntax error after (?P",    "syntax error in subpattern name (missing terminator)",
243    "two named groups have the same name",    "two named subpatterns have the same name",
244    "invalid UTF-8 string",    "invalid UTF-8 string",
245    /* 45 */    /* 45 */
246    "support for \\P, \\p, and \\X has not been compiled",    "support for \\P, \\p, and \\X has not been compiled",
247    "malformed \\P or \\p sequence",    "malformed \\P or \\p sequence",
248    "unknown property name after \\P or \\p"    "unknown property name after \\P or \\p",
249      "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)",
250      "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")",
251      /* 50 */
252      "repeated subpattern is too long",    /** DEAD **/
253      "octal value is greater than \\377 (not in UTF-8 mode)",
254      "internal error: overran compiling workspace",
255      "internal error: previously-checked referenced subpattern not found",
256      "DEFINE group contains more than one branch",
257      /* 55 */
258      "repeating a DEFINE group is not allowed",
259      "inconsistent NEWLINE options",
260      "\\g is not followed by a braced name or an optionally braced non-zero number",
261      "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number"
262  };  };
263    
264    
# Line 235  For convenience, we use the same bit def Line 278  For convenience, we use the same bit def
278    
279  Then we can use ctype_digit and ctype_xdigit in the code. */  Then we can use ctype_digit and ctype_xdigit in the code. */
280    
281  #if !EBCDIC    /* This is the "normal" case, for ASCII systems */  #ifndef EBCDIC  /* This is the "normal" case, for ASCII systems */
282  static const unsigned char digitab[] =  static const unsigned char digitab[] =
283    {    {
284    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
# Line 271  static const unsigned char digitab[] = Line 314  static const unsigned char digitab[] =
314    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
315    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
316    
317  #else          /* This is the "abnormal" case, for EBCDIC systems */  #else           /* This is the "abnormal" case, for EBCDIC systems */
318  static const unsigned char digitab[] =  static const unsigned char digitab[] =
319    {    {
320    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
# Line 285  static const unsigned char digitab[] = Line 328  static const unsigned char digitab[] =
328    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
329    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
330    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
331    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88-     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- 95    */
332    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
333    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
334    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
# Line 319  static const unsigned char ebcdic_charta Line 362  static const unsigned char ebcdic_charta
362    0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */    0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */
363    0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */    0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
364    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
365    0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88-  */    0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- 95 */
366    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
367    0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */    0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
368    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
# Line 346  static const unsigned char ebcdic_charta Line 389  static const unsigned char ebcdic_charta
389  /* Definition to allow mutual recursion */  /* Definition to allow mutual recursion */
390    
391  static BOOL  static BOOL
392    compile_regex(int, int, int *, uschar **, const uschar **, int *, BOOL, int,    compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
393      int *, int *, branch_chain *, compile_data *);      int *, int *, branch_chain *, compile_data *, int *);
394    
395    
396    
# Line 357  static BOOL Line 400  static BOOL
400    
401  /* This function is called when a \ has been encountered. It either returns a  /* This function is called when a \ has been encountered. It either returns a
402  positive value for a simple escape such as \n, or a negative value which  positive value for a simple escape such as \n, or a negative value which
403  encodes one of the more complicated things such as \d. When UTF-8 is enabled,  encodes one of the more complicated things such as \d. A backreference to group
404  a positive value greater than 255 may be returned. On entry, ptr is pointing at  n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
405  the \. On exit, it is on the final character of the escape sequence.  UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
406    ptr is pointing at the \. On exit, it is on the final character of the escape
407    sequence.
408    
409  Arguments:  Arguments:
410    ptrptr         points to the pattern position pointer    ptrptr         points to the pattern position pointer
# Line 392  if (c == 0) *errorcodeptr = ERR1; Line 437  if (c == 0) *errorcodeptr = ERR1;
437  a table. A non-zero result is something that can be returned immediately.  a table. A non-zero result is something that can be returned immediately.
438  Otherwise further processing may be required. */  Otherwise further processing may be required. */
439    
440  #if !EBCDIC    /* ASCII coding */  #ifndef EBCDIC  /* ASCII coding */
441  else if (c < '0' || c > 'z') {}                           /* Not alphameric */  else if (c < '0' || c > 'z') {}                           /* Not alphameric */
442  else if ((i = escapes[c - '0']) != 0) c = i;  else if ((i = escapes[c - '0']) != 0) c = i;
443    
444  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
445  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphameric */  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphameric */
446  else if ((i = escapes[c - 0x48]) != 0)  c = i;  else if ((i = escapes[c - 0x48]) != 0)  c = i;
447  #endif  #endif
# Line 406  else if ((i = escapes[c - 0x48]) != 0) Line 451  else if ((i = escapes[c - 0x48]) != 0)
451  else  else
452    {    {
453    const uschar *oldptr;    const uschar *oldptr;
454      BOOL braced, negated;
455    
456    switch (c)    switch (c)
457      {      {
458      /* A number of Perl escapes are not handled by PCRE. We give an explicit      /* A number of Perl escapes are not handled by PCRE. We give an explicit
# Line 419  else Line 466  else
466      *errorcodeptr = ERR37;      *errorcodeptr = ERR37;
467      break;      break;
468    
469        /* \g must be followed by a number, either plain or braced. If positive, it
470        is an absolute backreference. If negative, it is a relative backreference.
471        This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a
472        reference to a named group. This is part of Perl's movement towards a
473        unified syntax for back references. As this is synonymous with \k{name}, we
474        fudge it up by pretending it really was \k. */
475    
476        case 'g':
477        if (ptr[1] == '{')
478          {
479          const uschar *p;
480          for (p = ptr+2; *p != 0 && *p != '}'; p++)
481            if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
482          if (*p != 0 && *p != '}')
483            {
484            c = -ESC_k;
485            break;
486            }
487          braced = TRUE;
488          ptr++;
489          }
490        else braced = FALSE;
491    
492        if (ptr[1] == '-')
493          {
494          negated = TRUE;
495          ptr++;
496          }
497        else negated = FALSE;
498    
499        c = 0;
500        while ((digitab[ptr[1]] & ctype_digit) != 0)
501          c = c * 10 + *(++ptr) - '0';
502    
503        if (c == 0 || (braced && *(++ptr) != '}'))
504          {
505          *errorcodeptr = ERR57;
506          return 0;
507          }
508    
509        if (negated)
510          {
511          if (c > bracount)
512            {
513            *errorcodeptr = ERR15;
514            return 0;
515            }
516          c = bracount - (c - 1);
517          }
518    
519        c = -(ESC_REF + c);
520        break;
521    
522      /* The handling of escape sequences consisting of a string of digits      /* The handling of escape sequences consisting of a string of digits
523      starting with one that is not zero is not straightforward. By experiment,      starting with one that is not zero is not straightforward. By experiment,
524      the way Perl works seems to be as follows:      the way Perl works seems to be as follows:
# Line 460  else Line 560  else
560        }        }
561    
562      /* \0 always starts an octal number, but we may drop through to here with a      /* \0 always starts an octal number, but we may drop through to here with a
563      larger first octal digit. */      larger first octal digit. The original code used just to take the least
564        significant 8 bits of octal numbers (I think this is what early Perls used
565        to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
566        than 3 octal digits. */
567    
568      case '0':      case '0':
569      c -= '0';      c -= '0';
570      while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')      while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
571          c = c * 8 + *(++ptr) - '0';          c = c * 8 + *(++ptr) - '0';
572      c &= 255;     /* Take least significant 8 bits */      if (!utf8 && c > 255) *errorcodeptr = ERR51;
573      break;      break;
574    
575      /* \x is complicated. \x{ddd} is a character number which can be greater      /* \x is complicated. \x{ddd} is a character number which can be greater
# Line 486  else Line 589  else
589          if (c == 0 && cc == '0') continue;     /* Leading zeroes */          if (c == 0 && cc == '0') continue;     /* Leading zeroes */
590          count++;          count++;
591    
592  #if !EBCDIC    /* ASCII coding */  #ifndef EBCDIC  /* ASCII coding */
593          if (cc >= 'a') cc -= 32;               /* Convert to upper case */          if (cc >= 'a') cc -= 32;               /* Convert to upper case */
594          c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));          c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
595  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
596          if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */          if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */
597          c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));          c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
598  #endif  #endif
# Line 513  else Line 616  else
616        {        {
617        int cc;                               /* Some compilers don't like ++ */        int cc;                               /* Some compilers don't like ++ */
618        cc = *(++ptr);                        /* in initializers */        cc = *(++ptr);                        /* in initializers */
619  #if !EBCDIC    /* ASCII coding */  #ifndef EBCDIC  /* ASCII coding */
620        if (cc >= 'a') cc -= 32;              /* Convert to upper case */        if (cc >= 'a') cc -= 32;              /* Convert to upper case */
621        c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));        c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
622  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
623        if (cc <= 'z') cc += 64;              /* Convert to upper case */        if (cc <= 'z') cc += 64;              /* Convert to upper case */
624        c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));        c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
625  #endif  #endif
626        }        }
627      break;      break;
628    
629      /* Other special escapes not starting with a digit are straightforward */      /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
630        This coding is ASCII-specific, but then the whole concept of \cx is
631        ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
632    
633      case 'c':      case 'c':
634      c = *(++ptr);      c = *(++ptr);
# Line 533  else Line 638  else
638        return 0;        return 0;
639        }        }
640    
641      /* A letter is upper-cased; then the 0x40 bit is flipped. This coding  #ifndef EBCDIC  /* ASCII coding */
     is ASCII-specific, but then the whole concept of \cx is ASCII-specific.  
     (However, an EBCDIC equivalent has now been added.) */  
   
 #if !EBCDIC    /* ASCII coding */  
642      if (c >= 'a' && c <= 'z') c -= 32;      if (c >= 'a' && c <= 'z') c -= 32;
643      c ^= 0x40;      c ^= 0x40;
644  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
645      if (c >= 'a' && c <= 'z') c += 64;      if (c >= 'a' && c <= 'z') c += 64;
646      c ^= 0xC0;      c ^= 0xC0;
647  #endif  #endif
# Line 610  if (c == '{') Line 711  if (c == '{')
711      *negptr = TRUE;      *negptr = TRUE;
712      ptr++;      ptr++;
713      }      }
714    for (i = 0; i < sizeof(name) - 1; i++)    for (i = 0; i < (int)sizeof(name) - 1; i++)
715      {      {
716      c = *(++ptr);      c = *(++ptr);
717      if (c == 0) goto ERROR_RETURN;      if (c == 0) goto ERROR_RETURN;
# Line 763  return p; Line 864  return p;
864    
865    
866  /*************************************************  /*************************************************
867    *       Find forward referenced subpattern       *
868    *************************************************/
869    
870    /* This function scans along a pattern's text looking for capturing
871    subpatterns, and counting them. If it finds a named pattern that matches the
872    name it is given, it returns its number. Alternatively, if the name is NULL, it
873    returns when it reaches a given numbered subpattern. This is used for forward
874    references to subpatterns. We know that if (?P< is encountered, the name will
875    be terminated by '>' because that is checked in the first pass.
876    
877    Arguments:
878      ptr          current position in the pattern
879      count        current count of capturing parens so far encountered
880      name         name to seek, or NULL if seeking a numbered subpattern
881      lorn         name length, or subpattern number if name is NULL
882      xmode        TRUE if we are in /x mode
883    
884    Returns:       the number of the named subpattern, or -1 if not found
885    */
886    
887    static int
888    find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
889      BOOL xmode)
890    {
891    const uschar *thisname;
892    
893    for (; *ptr != 0; ptr++)
894      {
895      int term;
896    
897      /* Skip over backslashed characters and also entire \Q...\E */
898    
899      if (*ptr == '\\')
900        {
901        if (*(++ptr) == 0) return -1;
902        if (*ptr == 'Q') for (;;)
903          {
904          while (*(++ptr) != 0 && *ptr != '\\');
905          if (*ptr == 0) return -1;
906          if (*(++ptr) == 'E') break;
907          }
908        continue;
909        }
910    
911      /* Skip over character classes */
912    
913      if (*ptr == '[')
914        {
915        while (*(++ptr) != ']')
916          {
917          if (*ptr == '\\')
918            {
919            if (*(++ptr) == 0) return -1;
920            if (*ptr == 'Q') for (;;)
921              {
922              while (*(++ptr) != 0 && *ptr != '\\');
923              if (*ptr == 0) return -1;
924              if (*(++ptr) == 'E') break;
925              }
926            continue;
927            }
928          }
929        continue;
930        }
931    
932      /* Skip comments in /x mode */
933    
934      if (xmode && *ptr == '#')
935        {
936        while (*(++ptr) != 0 && *ptr != '\n');
937        if (*ptr == 0) return -1;
938        continue;
939        }
940    
941      /* An opening parens must now be a real metacharacter */
942    
943      if (*ptr != '(') continue;
944      if (ptr[1] != '?')
945        {
946        count++;
947        if (name == NULL && count == lorn) return count;
948        continue;
949        }
950    
951      ptr += 2;
952      if (*ptr == 'P') ptr++;                      /* Allow optional P */
953    
954      /* We have to disambiguate (?<! and (?<= from (?<name> */
955    
956      if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
957           *ptr != '\'')
958        continue;
959    
960      count++;
961    
962      if (name == NULL && count == lorn) return count;
963      term = *ptr++;
964      if (term == '<') term = '>';
965      thisname = ptr;
966      while (*ptr != term) ptr++;
967      if (name != NULL && lorn == ptr - thisname &&
968          strncmp((const char *)name, (const char *)thisname, lorn) == 0)
969        return count;
970      }
971    
972    return -1;
973    }
974    
975    
976    
977    /*************************************************
978  *      Find first significant op code            *  *      Find first significant op code            *
979  *************************************************/  *************************************************/
980    
# Line 811  for (;;) Line 1023  for (;;)
1023    
1024      case OP_CALLOUT:      case OP_CALLOUT:
1025      case OP_CREF:      case OP_CREF:
1026      case OP_BRANUMBER:      case OP_RREF:
1027        case OP_DEF:
1028      code += _pcre_OP_lengths[*code];      code += _pcre_OP_lengths[*code];
1029      break;      break;
1030    
# Line 856  for (;;) Line 1069  for (;;)
1069    {    {
1070    int d;    int d;
1071    register int op = *cc;    register int op = *cc;
   if (op >= OP_BRA) op = OP_BRA;  
1072    
1073    switch (op)    switch (op)
1074      {      {
1075        case OP_CBRA:
1076      case OP_BRA:      case OP_BRA:
1077      case OP_ONCE:      case OP_ONCE:
1078      case OP_COND:      case OP_COND:
1079      d = find_fixedlength(cc, options);      d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1080      if (d < 0) return d;      if (d < 0) return d;
1081      branchlength += d;      branchlength += d;
1082      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
# Line 898  for (;;) Line 1111  for (;;)
1111      /* Skip over things that don't match chars */      /* Skip over things that don't match chars */
1112    
1113      case OP_REVERSE:      case OP_REVERSE:
     case OP_BRANUMBER:  
1114      case OP_CREF:      case OP_CREF:
1115        case OP_RREF:
1116        case OP_DEF:
1117      case OP_OPT:      case OP_OPT:
1118      case OP_CALLOUT:      case OP_CALLOUT:
1119      case OP_SOD:      case OP_SOD:
# Line 917  for (;;) Line 1131  for (;;)
1131    
1132      case OP_CHAR:      case OP_CHAR:
1133      case OP_CHARNC:      case OP_CHARNC:
1134        case OP_NOT:
1135      branchlength++;      branchlength++;
1136      cc += 2;      cc += 2;
1137  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
# Line 1031  Returns: pointer to the opcode for Line 1246  Returns: pointer to the opcode for
1246  static const uschar *  static const uschar *
1247  find_bracket(const uschar *code, BOOL utf8, int number)  find_bracket(const uschar *code, BOOL utf8, int number)
1248  {  {
 #ifndef SUPPORT_UTF8  
 utf8 = utf8;               /* Stop pedantic compilers complaining */  
 #endif  
   
1249  for (;;)  for (;;)
1250    {    {
1251    register int c = *code;    register int c = *code;
1252    if (c == OP_END) return NULL;    if (c == OP_END) return NULL;
1253    else if (c > OP_BRA)  
1254      /* XCLASS is used for classes that cannot be represented just by a bit
1255      map. This includes negated single high-valued characters. The length in
1256      the table is zero; the actual length is stored in the compiled code. */
1257    
1258      if (c == OP_XCLASS) code += GET(code, 1);
1259    
1260      /* Handle capturing bracket */
1261    
1262      else if (c == OP_CBRA)
1263      {      {
1264      int n = c - OP_BRA;      int n = GET2(code, 1+LINK_SIZE);
     if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE);  
1265      if (n == number) return (uschar *)code;      if (n == number) return (uschar *)code;
1266      code += _pcre_OP_lengths[OP_BRA];      code += _pcre_OP_lengths[c];
1267      }      }
1268    
1269      /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1270      a multi-byte character. The length in the table is a minimum, so we have to
1271      arrange to skip the extra bytes. */
1272    
1273    else    else
1274      {      {
1275      code += _pcre_OP_lengths[c];      code += _pcre_OP_lengths[c];
   
1276  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
   
     /* In UTF-8 mode, opcodes that are followed by a character may be followed  
     by a multi-byte character. The length in the table is a minimum, so we have  
     to scan along to skip the extra bytes. All opcodes are less than 128, so we  
     can use relatively efficient code. */  
   
1277      if (utf8) switch(c)      if (utf8) switch(c)
1278        {        {
1279        case OP_CHAR:        case OP_CHAR:
# Line 1064  for (;;) Line 1281  for (;;)
1281        case OP_EXACT:        case OP_EXACT:
1282        case OP_UPTO:        case OP_UPTO:
1283        case OP_MINUPTO:        case OP_MINUPTO:
1284          case OP_POSUPTO:
1285        case OP_STAR:        case OP_STAR:
1286        case OP_MINSTAR:        case OP_MINSTAR:
1287          case OP_POSSTAR:
1288        case OP_PLUS:        case OP_PLUS:
1289        case OP_MINPLUS:        case OP_MINPLUS:
1290          case OP_POSPLUS:
1291        case OP_QUERY:        case OP_QUERY:
1292        case OP_MINQUERY:        case OP_MINQUERY:
1293        while ((*code & 0xc0) == 0x80) code++;        case OP_POSQUERY:
1294        break;        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
   
       /* XCLASS is used for classes that cannot be represented just by a bit  
       map. This includes negated single high-valued characters. The length in  
       the table is zero; the actual length is stored in the compiled code. */  
   
       case OP_XCLASS:  
       code += GET(code, 1) + 1;  
1295        break;        break;
1296        }        }
1297  #endif  #endif
# Line 1105  Returns: pointer to the opcode for Line 1318  Returns: pointer to the opcode for
1318  static const uschar *  static const uschar *
1319  find_recurse(const uschar *code, BOOL utf8)  find_recurse(const uschar *code, BOOL utf8)
1320  {  {
 #ifndef SUPPORT_UTF8  
 utf8 = utf8;               /* Stop pedantic compilers complaining */  
 #endif  
   
1321  for (;;)  for (;;)
1322    {    {
1323    register int c = *code;    register int c = *code;
1324    if (c == OP_END) return NULL;    if (c == OP_END) return NULL;
1325    else if (c == OP_RECURSE) return code;    if (c == OP_RECURSE) return code;
1326    else if (c > OP_BRA)  
1327      {    /* XCLASS is used for classes that cannot be represented just by a bit
1328      code += _pcre_OP_lengths[OP_BRA];    map. This includes negated single high-valued characters. The length in
1329      }    the table is zero; the actual length is stored in the compiled code. */
1330    
1331      if (c == OP_XCLASS) code += GET(code, 1);
1332    
1333      /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes
1334      that are followed by a character may be followed by a multi-byte character.
1335      The length in the table is a minimum, so we have to arrange to skip the extra
1336      bytes. */
1337    
1338    else    else
1339      {      {
1340      code += _pcre_OP_lengths[c];      code += _pcre_OP_lengths[c];
   
1341  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
   
     /* In UTF-8 mode, opcodes that are followed by a character may be followed  
     by a multi-byte character. The length in the table is a minimum, so we have  
     to scan along to skip the extra bytes. All opcodes are less than 128, so we  
     can use relatively efficient code. */  
   
1342      if (utf8) switch(c)      if (utf8) switch(c)
1343        {        {
1344        case OP_CHAR:        case OP_CHAR:
# Line 1136  for (;;) Line 1346  for (;;)
1346        case OP_EXACT:        case OP_EXACT:
1347        case OP_UPTO:        case OP_UPTO:
1348        case OP_MINUPTO:        case OP_MINUPTO:
1349          case OP_POSUPTO:
1350        case OP_STAR:        case OP_STAR:
1351        case OP_MINSTAR:        case OP_MINSTAR:
1352          case OP_POSSTAR:
1353        case OP_PLUS:        case OP_PLUS:
1354        case OP_MINPLUS:        case OP_MINPLUS:
1355          case OP_POSPLUS:
1356        case OP_QUERY:        case OP_QUERY:
1357        case OP_MINQUERY:        case OP_MINQUERY:
1358        while ((*code & 0xc0) == 0x80) code++;        case OP_POSQUERY:
1359        break;        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
   
       /* XCLASS is used for classes that cannot be represented just by a bit  
       map. This includes negated single high-valued characters. The length in  
       the table is zero; the actual length is stored in the compiled code. */  
   
       case OP_XCLASS:  
       code += GET(code, 1) + 1;  
1360        break;        break;
1361        }        }
1362  #endif  #endif
# Line 1165  for (;;) Line 1371  for (;;)
1371  *************************************************/  *************************************************/
1372    
1373  /* This function scans through a branch of a compiled pattern to see whether it  /* This function scans through a branch of a compiled pattern to see whether it
1374  can match the empty string or not. It is called only from could_be_empty()  can match the empty string or not. It is called from could_be_empty()
1375  below. Note that first_significant_code() skips over assertions. If we hit an  below and from compile_branch() when checking for an unlimited repeat of a
1376  unclosed bracket, we return "empty" - this means we've struck an inner bracket  group that can match nothing. Note that first_significant_code() skips over
1377  whose current branch will already have been scanned.  assertions. If we hit an unclosed bracket, we return "empty" - this means we've
1378    struck an inner bracket whose current branch will already have been scanned.
1379    
1380  Arguments:  Arguments:
1381    code        points to start of search    code        points to start of search
# Line 1182  static BOOL Line 1389  static BOOL
1389  could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)  could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1390  {  {
1391  register int c;  register int c;
1392  for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0, TRUE);  for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1393       code < endcode;       code < endcode;
1394       code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))       code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1395    {    {
# Line 1190  for (code = first_significant_code(code Line 1397  for (code = first_significant_code(code
1397    
1398    c = *code;    c = *code;
1399    
1400    if (c >= OP_BRA)    /* Groups with zero repeats can of course be empty; skip them. */
1401    
1402      if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1403        {
1404        code += _pcre_OP_lengths[c];
1405        do code += GET(code, 1); while (*code == OP_ALT);
1406        c = *code;
1407        continue;
1408        }
1409    
1410      /* For other groups, scan the branches. */
1411    
1412      if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE)
1413      {      {
1414      BOOL empty_branch;      BOOL empty_branch;
1415      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
# Line 1206  for (code = first_significant_code(code Line 1425  for (code = first_significant_code(code
1425        }        }
1426      while (*code == OP_ALT);      while (*code == OP_ALT);
1427      if (!empty_branch) return FALSE;   /* All branches are non-empty */      if (!empty_branch) return FALSE;   /* All branches are non-empty */
     code += 1 + LINK_SIZE;  
1428      c = *code;      c = *code;
1429        continue;
1430      }      }
1431    
1432    else switch (c)    /* Handle the other opcodes */
1433    
1434      switch (c)
1435      {      {
1436      /* Check for quantifiers after a class */      /* Check for quantifiers after a class */
1437    
# Line 1266  for (code = first_significant_code(code Line 1487  for (code = first_significant_code(code
1487      case OP_NOT:      case OP_NOT:
1488      case OP_PLUS:      case OP_PLUS:
1489      case OP_MINPLUS:      case OP_MINPLUS:
1490        case OP_POSPLUS:
1491      case OP_EXACT:      case OP_EXACT:
1492      case OP_NOTPLUS:      case OP_NOTPLUS:
1493      case OP_NOTMINPLUS:      case OP_NOTMINPLUS:
1494        case OP_NOTPOSPLUS:
1495      case OP_NOTEXACT:      case OP_NOTEXACT:
1496      case OP_TYPEPLUS:      case OP_TYPEPLUS:
1497      case OP_TYPEMINPLUS:      case OP_TYPEMINPLUS:
1498        case OP_TYPEPOSPLUS:
1499      case OP_TYPEEXACT:      case OP_TYPEEXACT:
1500      return FALSE;      return FALSE;
1501    
# Line 1283  for (code = first_significant_code(code Line 1507  for (code = first_significant_code(code
1507      case OP_ALT:      case OP_ALT:
1508      return TRUE;      return TRUE;
1509    
1510      /* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO  may be      /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1511      followed by a multibyte character */      MINUPTO, and POSUPTO may be followed by a multibyte character */
1512    
1513  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1514      case OP_STAR:      case OP_STAR:
1515      case OP_MINSTAR:      case OP_MINSTAR:
1516        case OP_POSSTAR:
1517      case OP_QUERY:      case OP_QUERY:
1518      case OP_MINQUERY:      case OP_MINQUERY:
1519        case OP_POSQUERY:
1520      case OP_UPTO:      case OP_UPTO:
1521      case OP_MINUPTO:      case OP_MINUPTO:
1522        case OP_POSUPTO:
1523      if (utf8) while ((code[2] & 0xc0) == 0x80) code++;      if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1524      break;      break;
1525  #endif  #endif
# Line 1410  earlier groups that are outside the curr Line 1637  earlier groups that are outside the curr
1637  optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before  optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1638  it, after it has been compiled. This means that any OP_RECURSE items within it  it, after it has been compiled. This means that any OP_RECURSE items within it
1639  that refer to the group itself or any contained groups have to have their  that refer to the group itself or any contained groups have to have their
1640  offsets adjusted. That is the job of this function. Before it is called, the  offsets adjusted. That is one of the jobs of this function. Before it is called,
1641  partially compiled regex must be temporarily terminated with OP_END.  the partially compiled regex must be temporarily terminated with OP_END.
1642    
1643    This function has been extended with the possibility of forward references for
1644    recursions and subroutine calls. It must also check the list of such references
1645    for the group we are dealing with. If it finds that one of the recursions in
1646    the current group is on this list, it adjusts the offset in the list, not the
1647    value in the reference (which is a group number).
1648    
1649  Arguments:  Arguments:
1650    group      points to the start of the group    group      points to the start of the group
1651    adjust     the amount by which the group is to be moved    adjust     the amount by which the group is to be moved
1652    utf8       TRUE in UTF-8 mode    utf8       TRUE in UTF-8 mode
1653    cd         contains pointers to tables etc.    cd         contains pointers to tables etc.
1654      save_hwm   the hwm forward reference pointer at the start of the group
1655    
1656  Returns:     nothing  Returns:     nothing
1657  */  */
1658    
1659  static void  static void
1660  adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd)  adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1661      uschar *save_hwm)
1662  {  {
1663  uschar *ptr = group;  uschar *ptr = group;
1664  while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)  while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1665    {    {
1666    int offset = GET(ptr, 1);    int offset;
1667    if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);    uschar *hc;
1668    
1669      /* See if this recursion is on the forward reference list. If so, adjust the
1670      reference. */
1671    
1672      for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1673        {
1674        offset = GET(hc, 0);
1675        if (cd->start_code + offset == ptr + 1)
1676          {
1677          PUT(hc, 0, offset + adjust);
1678          break;
1679          }
1680        }
1681    
1682      /* Otherwise, adjust the recursion offset if it's after the start of this
1683      group. */
1684    
1685      if (hc >= cd->hwm)
1686        {
1687        offset = GET(ptr, 1);
1688        if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1689        }
1690    
1691    ptr += 1 + LINK_SIZE;    ptr += 1 + LINK_SIZE;
1692    }    }
1693  }  }
# Line 1508  Yield: TRUE when range returned; Line 1766  Yield: TRUE when range returned;
1766  */  */
1767    
1768  static BOOL  static BOOL
1769  get_othercase_range(int *cptr, int d, int *ocptr, int *odptr)  get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
1770      unsigned int *odptr)
1771  {  {
1772  int c, othercase, next;  unsigned int c, othercase, next;
1773    
1774  for (c = *cptr; c <= d; c++)  for (c = *cptr; c <= d; c++)
1775    { if ((othercase = _pcre_ucp_othercase(c)) >= 0) break; }    { if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) break; }
1776    
1777  if (c > d) return FALSE;  if (c > d) return FALSE;
1778    
# Line 1534  return TRUE; Line 1793  return TRUE;
1793  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
1794    
1795    
1796    
1797  /*************************************************  /*************************************************
1798  *           Compile one branch                   *  *     Check if auto-possessifying is possible    *
1799  *************************************************/  *************************************************/
1800    
1801  /* Scan the pattern, compiling it into the code vector. If the options are  /* This function is called for unlimited repeats of certain items, to see
1802  changed during the branch, the pointer is used to change the external options  whether the next thing could possibly match the repeated item. If not, it makes
1803  bits.  sense to automatically possessify the repeated item.
1804    
1805  Arguments:  Arguments:
1806    optionsptr     pointer to the option bits    op_code       the repeated op code
1807    brackets       points to number of extracting brackets used    item          data for this item, depends on the opcode
1808    codeptr        points to the pointer to the current code point    utf8          TRUE in UTF-8 mode
1809    ptrptr         points to the current pattern pointer    utf8_char     used for utf8 character bytes, NULL if not relevant
1810    errorcodeptr   points to error code variable    ptr           next character in pattern
1811    firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)    options       options bits
1812    reqbyteptr     set to the last literal character required, else < 0    cd            contains pointers to tables etc.
   bcptr          points to current branch chain  
   cd             contains pointers to tables etc.  
1813    
1814  Returns:         TRUE on success  Returns:        TRUE if possessifying is wanted
                  FALSE, with *errorcodeptr set non-zero on error  
1815  */  */
1816    
1817  static BOOL  static BOOL
1818  compile_branch(int *optionsptr, int *brackets, uschar **codeptr,  check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
1819    const uschar **ptrptr, int *errorcodeptr, int *firstbyteptr,    const uschar *ptr, int options, compile_data *cd)
   int *reqbyteptr, branch_chain *bcptr, compile_data *cd)  
1820  {  {
1821  int repeat_type, op_type;  int next;
1822  int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */  
1823  int bravalue = 0;  /* Skip whitespace and comments in extended mode */
1824  int greedy_default, greedy_non_default;  
1825  int firstbyte, reqbyte;  if ((options & PCRE_EXTENDED) != 0)
1826  int zeroreqbyte, zerofirstbyte;    {
1827  int req_caseopt, reqvary, tempreqvary;    for (;;)
1828  int condcount = 0;      {
1829  int options = *optionsptr;      while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1830  int after_manual_callout = 0;      if (*ptr == '#')
1831  register int c;        {
1832  register uschar *code = *codeptr;        while (*(++ptr) != 0)
1833  uschar *tempcode;          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1834  BOOL inescq = FALSE;        }
1835  BOOL groupsetfirstbyte = FALSE;      else break;
1836  const uschar *ptr = *ptrptr;      }
1837  const uschar *tempptr;    }
1838  uschar *previous = NULL;  
1839  uschar *previous_callout = NULL;  /* If the next item is one that we can handle, get its value. A non-negative
1840  uschar classbits[32];  value is a character, a negative value is an escape value. */
1841    
1842    if (*ptr == '\\')
1843      {
1844      int temperrorcode = 0;
1845      next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
1846      if (temperrorcode != 0) return FALSE;
1847      ptr++;    /* Point after the escape sequence */
1848      }
1849    
1850    else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
1851      {
1852  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1853  BOOL class_utf8;    if (utf8) { GETCHARINC(next, ptr); } else
 BOOL utf8 = (options & PCRE_UTF8) != 0;  
 uschar *class_utf8data;  
 uschar utf8_char[6];  
 #else  
 BOOL utf8 = FALSE;  
1854  #endif  #endif
1855      next = *ptr++;
1856      }
1857    
1858  /* Set up the default and non-default settings for greediness */  else return FALSE;
1859    
1860  greedy_default = ((options & PCRE_UNGREEDY) != 0);  /* Skip whitespace and comments in extended mode */
 greedy_non_default = greedy_default ^ 1;  
1861    
1862  /* Initialize no first byte, no required byte. REQ_UNSET means "no char  if ((options & PCRE_EXTENDED) != 0)
1863  matching encountered yet". It gets changed to REQ_NONE if we hit something that    {
1864  matches a non-fixed char first char; reqbyte just remains unset if we never    for (;;)
1865  find one.      {
1866        while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1867        if (*ptr == '#')
1868          {
1869          while (*(++ptr) != 0)
1870            if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1871          }
1872        else break;
1873        }
1874      }
1875    
1876  When we hit a repeat whose minimum is zero, we may have to adjust these values  /* If the next thing is itself optional, we have to give up. */
 to take the zero repeat into account. This is implemented by setting them to  
 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual  
 item types that can be repeated set these backoff variables appropriately. */  
1877    
1878  firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;  if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
1879      return FALSE;
1880    
1881  /* The variable req_caseopt contains either the REQ_CASELESS value or zero,  /* Now compare the next item with the previous opcode. If the previous is a
1882  according to the current setting of the caseless flag. REQ_CASELESS is a bit  positive single character match, "item" either contains the character or, if
1883  value > 255. It is added into the firstbyte or reqbyte variables to record the  "item" is greater than 127 in utf8 mode, the character's bytes are in
1884  case status of the value. This is used only for ASCII characters. */  utf8_char. */
1885    
 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;  
1886    
1887  /* Switch on next character until the end of the branch */  /* Handle cases when the next item is a character. */
1888    
1889  for (;; ptr++)  if (next >= 0) switch(op_code)
1890    {    {
1891    BOOL negate_class;    case OP_CHAR:
1892    BOOL possessive_quantifier;  #ifdef SUPPORT_UTF8
1893    BOOL is_quantifier;    if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1894    int class_charcount;  #endif
1895    int class_lastchar;    return item != next;
   int newoptions;  
   int recno;  
   int skipbytes;  
   int subreqbyte;  
   int subfirstbyte;  
   int mclength;  
   uschar mcbuffer[8];  
1896    
1897    /* Next byte in the pattern */    /* For CHARNC (caseless character) we must check the other case. If we have
1898      Unicode property support, we can use it to test the other case of
1899      high-valued characters. */
1900    
1901    c = *ptr;    case OP_CHARNC:
1902    #ifdef SUPPORT_UTF8
1903      if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1904    #endif
1905      if (item == next) return FALSE;
1906    #ifdef SUPPORT_UTF8
1907      if (utf8)
1908        {
1909        unsigned int othercase;
1910        if (next < 128) othercase = cd->fcc[next]; else
1911    #ifdef SUPPORT_UCP
1912        othercase = _pcre_ucp_othercase((unsigned int)next);
1913    #else
1914        othercase = NOTACHAR;
1915    #endif
1916        return (unsigned int)item != othercase;
1917        }
1918      else
1919    #endif  /* SUPPORT_UTF8 */
1920      return (item != cd->fcc[next]);  /* Non-UTF-8 mode */
1921    
1922    /* If in \Q...\E, check for the end; if not, we have a literal */    /* For OP_NOT, "item" must be a single-byte character. */
1923    
1924    if (inescq && c != 0)    case OP_NOT:
1925      if (next < 0) return FALSE;  /* Not a character */
1926      if (item == next) return TRUE;
1927      if ((options & PCRE_CASELESS) == 0) return FALSE;
1928    #ifdef SUPPORT_UTF8
1929      if (utf8)
1930      {      {
1931      if (c == '\\' && ptr[1] == 'E')      unsigned int othercase;
1932        {      if (next < 128) othercase = cd->fcc[next]; else
1933        inescq = FALSE;  #ifdef SUPPORT_UCP
1934        ptr++;      othercase = _pcre_ucp_othercase(next);
1935        continue;  #else
1936        }      othercase = NOTACHAR;
1937    #endif
1938        return (unsigned int)item == othercase;
1939        }
1940      else
1941    #endif  /* SUPPORT_UTF8 */
1942      return (item == cd->fcc[next]);  /* Non-UTF-8 mode */
1943    
1944      case OP_DIGIT:
1945      return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
1946    
1947      case OP_NOT_DIGIT:
1948      return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
1949    
1950      case OP_WHITESPACE:
1951      return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
1952    
1953      case OP_NOT_WHITESPACE:
1954      return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
1955    
1956      case OP_WORDCHAR:
1957      return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
1958    
1959      case OP_NOT_WORDCHAR:
1960      return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
1961    
1962      case OP_HSPACE:
1963      case OP_NOT_HSPACE:
1964      switch(next)
1965        {
1966        case 0x09:
1967        case 0x20:
1968        case 0xa0:
1969        case 0x1680:
1970        case 0x180e:
1971        case 0x2000:
1972        case 0x2001:
1973        case 0x2002:
1974        case 0x2003:
1975        case 0x2004:
1976        case 0x2005:
1977        case 0x2006:
1978        case 0x2007:
1979        case 0x2008:
1980        case 0x2009:
1981        case 0x200A:
1982        case 0x202f:
1983        case 0x205f:
1984        case 0x3000:
1985        return op_code != OP_HSPACE;
1986        default:
1987        return op_code == OP_HSPACE;
1988        }
1989    
1990      case OP_VSPACE:
1991      case OP_NOT_VSPACE:
1992      switch(next)
1993        {
1994        case 0x0a:
1995        case 0x0b:
1996        case 0x0c:
1997        case 0x0d:
1998        case 0x85:
1999        case 0x2028:
2000        case 0x2029:
2001        return op_code != OP_VSPACE;
2002        default:
2003        return op_code == OP_VSPACE;
2004        }
2005    
2006      default:
2007      return FALSE;
2008      }
2009    
2010    
2011    /* Handle the case when the next item is \d, \s, etc. */
2012    
2013    switch(op_code)
2014      {
2015      case OP_CHAR:
2016      case OP_CHARNC:
2017    #ifdef SUPPORT_UTF8
2018      if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2019    #endif
2020      switch(-next)
2021        {
2022        case ESC_d:
2023        return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2024    
2025        case ESC_D:
2026        return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2027    
2028        case ESC_s:
2029        return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2030    
2031        case ESC_S:
2032        return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2033    
2034        case ESC_w:
2035        return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2036    
2037        case ESC_W:
2038        return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2039    
2040        case ESC_h:
2041        case ESC_H:
2042        switch(item)
2043          {
2044          case 0x09:
2045          case 0x20:
2046          case 0xa0:
2047          case 0x1680:
2048          case 0x180e:
2049          case 0x2000:
2050          case 0x2001:
2051          case 0x2002:
2052          case 0x2003:
2053          case 0x2004:
2054          case 0x2005:
2055          case 0x2006:
2056          case 0x2007:
2057          case 0x2008:
2058          case 0x2009:
2059          case 0x200A:
2060          case 0x202f:
2061          case 0x205f:
2062          case 0x3000:
2063          return -next != ESC_h;
2064          default:
2065          return -next == ESC_h;
2066          }
2067    
2068        case ESC_v:
2069        case ESC_V:
2070        switch(item)
2071          {
2072          case 0x0a:
2073          case 0x0b:
2074          case 0x0c:
2075          case 0x0d:
2076          case 0x85:
2077          case 0x2028:
2078          case 0x2029:
2079          return -next != ESC_v;
2080          default:
2081          return -next == ESC_v;
2082          }
2083    
2084        default:
2085        return FALSE;
2086        }
2087    
2088      case OP_DIGIT:
2089      return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2090             next == -ESC_h || next == -ESC_v;
2091    
2092      case OP_NOT_DIGIT:
2093      return next == -ESC_d;
2094    
2095      case OP_WHITESPACE:
2096      return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2097    
2098      case OP_NOT_WHITESPACE:
2099      return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2100    
2101      case OP_HSPACE:
2102      return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2103    
2104      case OP_NOT_HSPACE:
2105      return next == -ESC_h;
2106    
2107      /* Can't have \S in here because VT matches \S (Perl anomaly) */
2108      case OP_VSPACE:
2109      return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2110    
2111      case OP_NOT_VSPACE:
2112      return next == -ESC_v;
2113    
2114      case OP_WORDCHAR:
2115      return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2116    
2117      case OP_NOT_WORDCHAR:
2118      return next == -ESC_w || next == -ESC_d;
2119    
2120      default:
2121      return FALSE;
2122      }
2123    
2124    /* Control does not reach here */
2125    }
2126    
2127    
2128    
2129    /*************************************************
2130    *           Compile one branch                   *
2131    *************************************************/
2132    
2133    /* Scan the pattern, compiling it into the code vector. If the options are
2134    changed during the branch, the pointer is used to change the external options
2135    bits. This function is used during the pre-compile phase when we are trying
2136    to find out the amount of memory needed, as well as during the real compile
2137    phase. The value of lengthptr distinguishes the two phases.
2138    
2139    Arguments:
2140      optionsptr     pointer to the option bits
2141      codeptr        points to the pointer to the current code point
2142      ptrptr         points to the current pattern pointer
2143      errorcodeptr   points to error code variable
2144      firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2145      reqbyteptr     set to the last literal character required, else < 0
2146      bcptr          points to current branch chain
2147      cd             contains pointers to tables etc.
2148      lengthptr      NULL during the real compile phase
2149                     points to length accumulator during pre-compile phase
2150    
2151    Returns:         TRUE on success
2152                     FALSE, with *errorcodeptr set non-zero on error
2153    */
2154    
2155    static BOOL
2156    compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2157      int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2158      compile_data *cd, int *lengthptr)
2159    {
2160    int repeat_type, op_type;
2161    int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
2162    int bravalue = 0;
2163    int greedy_default, greedy_non_default;
2164    int firstbyte, reqbyte;
2165    int zeroreqbyte, zerofirstbyte;
2166    int req_caseopt, reqvary, tempreqvary;
2167    int options = *optionsptr;
2168    int after_manual_callout = 0;
2169    int length_prevgroup = 0;
2170    register int c;
2171    register uschar *code = *codeptr;
2172    uschar *last_code = code;
2173    uschar *orig_code = code;
2174    uschar *tempcode;
2175    BOOL inescq = FALSE;
2176    BOOL groupsetfirstbyte = FALSE;
2177    const uschar *ptr = *ptrptr;
2178    const uschar *tempptr;
2179    uschar *previous = NULL;
2180    uschar *previous_callout = NULL;
2181    uschar *save_hwm = NULL;
2182    uschar classbits[32];
2183    
2184    #ifdef SUPPORT_UTF8
2185    BOOL class_utf8;
2186    BOOL utf8 = (options & PCRE_UTF8) != 0;
2187    uschar *class_utf8data;
2188    uschar utf8_char[6];
2189    #else
2190    BOOL utf8 = FALSE;
2191    uschar *utf8_char = NULL;
2192    #endif
2193    
2194    #ifdef DEBUG
2195    if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2196    #endif
2197    
2198    /* Set up the default and non-default settings for greediness */
2199    
2200    greedy_default = ((options & PCRE_UNGREEDY) != 0);
2201    greedy_non_default = greedy_default ^ 1;
2202    
2203    /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2204    matching encountered yet". It gets changed to REQ_NONE if we hit something that
2205    matches a non-fixed char first char; reqbyte just remains unset if we never
2206    find one.
2207    
2208    When we hit a repeat whose minimum is zero, we may have to adjust these values
2209    to take the zero repeat into account. This is implemented by setting them to
2210    zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2211    item types that can be repeated set these backoff variables appropriately. */
2212    
2213    firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2214    
2215    /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2216    according to the current setting of the caseless flag. REQ_CASELESS is a bit
2217    value > 255. It is added into the firstbyte or reqbyte variables to record the
2218    case status of the value. This is used only for ASCII characters. */
2219    
2220    req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2221    
2222    /* Switch on next character until the end of the branch */
2223    
2224    for (;; ptr++)
2225      {
2226      BOOL negate_class;
2227      BOOL possessive_quantifier;
2228      BOOL is_quantifier;
2229      BOOL is_recurse;
2230      BOOL reset_bracount;
2231      int class_charcount;
2232      int class_lastchar;
2233      int newoptions;
2234      int recno;
2235      int refsign;
2236      int skipbytes;
2237      int subreqbyte;
2238      int subfirstbyte;
2239      int terminator;
2240      int mclength;
2241      uschar mcbuffer[8];
2242    
2243      /* Get next byte in the pattern */
2244    
2245      c = *ptr;
2246    
2247      /* If we are in the pre-compile phase, accumulate the length used for the
2248      previous cycle of this loop. */
2249    
2250      if (lengthptr != NULL)
2251        {
2252    #ifdef DEBUG
2253        if (code > cd->hwm) cd->hwm = code;                 /* High water info */
2254    #endif
2255        if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2256          {
2257          *errorcodeptr = ERR52;
2258          goto FAILED;
2259          }
2260    
2261        /* There is at least one situation where code goes backwards: this is the
2262        case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2263        the class is simply eliminated. However, it is created first, so we have to
2264        allow memory for it. Therefore, don't ever reduce the length at this point.
2265        */
2266    
2267        if (code < last_code) code = last_code;
2268    
2269        /* Paranoid check for integer overflow */
2270    
2271        if (OFLOW_MAX - *lengthptr < code - last_code)
2272          {
2273          *errorcodeptr = ERR20;
2274          goto FAILED;
2275          }
2276    
2277        *lengthptr += code - last_code;
2278        DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2279    
2280        /* If "previous" is set and it is not at the start of the work space, move
2281        it back to there, in order to avoid filling up the work space. Otherwise,
2282        if "previous" is NULL, reset the current code pointer to the start. */
2283    
2284        if (previous != NULL)
2285          {
2286          if (previous > orig_code)
2287            {
2288            memmove(orig_code, previous, code - previous);
2289            code -= previous - orig_code;
2290            previous = orig_code;
2291            }
2292          }
2293        else code = orig_code;
2294    
2295        /* Remember where this code item starts so we can pick up the length
2296        next time round. */
2297    
2298        last_code = code;
2299        }
2300    
2301      /* In the real compile phase, just check the workspace used by the forward
2302      reference list. */
2303    
2304      else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2305        {
2306        *errorcodeptr = ERR52;
2307        goto FAILED;
2308        }
2309    
2310      /* If in \Q...\E, check for the end; if not, we have a literal */
2311    
2312      if (inescq && c != 0)
2313        {
2314        if (c == '\\' && ptr[1] == 'E')
2315          {
2316          inescq = FALSE;
2317          ptr++;
2318          continue;
2319          }
2320      else      else
2321        {        {
2322        if (previous_callout != NULL)        if (previous_callout != NULL)
2323          {          {
2324          complete_callout(previous_callout, ptr, cd);          if (lengthptr == NULL)  /* Don't attempt in pre-compile phase */
2325              complete_callout(previous_callout, ptr, cd);
2326          previous_callout = NULL;          previous_callout = NULL;
2327          }          }
2328        if ((options & PCRE_AUTO_CALLOUT) != 0)        if ((options & PCRE_AUTO_CALLOUT) != 0)
# Line 1672  for (;; ptr++) Line 2343  for (;; ptr++)
2343    if (!is_quantifier && previous_callout != NULL &&    if (!is_quantifier && previous_callout != NULL &&
2344         after_manual_callout-- <= 0)         after_manual_callout-- <= 0)
2345      {      {
2346      complete_callout(previous_callout, ptr, cd);      if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
2347          complete_callout(previous_callout, ptr, cd);
2348      previous_callout = NULL;      previous_callout = NULL;
2349      }      }
2350    
# Line 1683  for (;; ptr++) Line 2355  for (;; ptr++)
2355      if ((cd->ctypes[c] & ctype_space) != 0) continue;      if ((cd->ctypes[c] & ctype_space) != 0) continue;
2356      if (c == '#')      if (c == '#')
2357        {        {
2358        /* The space before the ; is to avoid a warning on a silly compiler        while (*(++ptr) != 0)
2359        on the Macintosh. */          {
2360        while ((c = *(++ptr)) != 0 && c != NEWLINE) ;          if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2361        if (c != 0) continue;   /* Else fall through to handle end of string */          }
2362          if (*ptr != 0) continue;
2363    
2364          /* Else fall through to handle end of string */
2365          c = 0;
2366        }        }
2367      }      }
2368    
# Line 1700  for (;; ptr++) Line 2376  for (;; ptr++)
2376    
2377    switch(c)    switch(c)
2378      {      {
2379      /* The branch terminates at end of string, |, or ). */      /* ===================================================================*/
2380        case 0:                        /* The branch terminates at string end */
2381      case 0:      case '|':                      /* or | or ) */
     case '|':  
2382      case ')':      case ')':
2383      *firstbyteptr = firstbyte;      *firstbyteptr = firstbyte;
2384      *reqbyteptr = reqbyte;      *reqbyteptr = reqbyte;
2385      *codeptr = code;      *codeptr = code;
2386      *ptrptr = ptr;      *ptrptr = ptr;
2387        if (lengthptr != NULL)
2388          {
2389          if (OFLOW_MAX - *lengthptr < code - last_code)
2390            {
2391            *errorcodeptr = ERR20;
2392            goto FAILED;
2393            }
2394          *lengthptr += code - last_code;   /* To include callout length */
2395          DPRINTF((">> end branch\n"));
2396          }
2397      return TRUE;      return TRUE;
2398    
2399    
2400        /* ===================================================================*/
2401      /* Handle single-character metacharacters. In multiline mode, ^ disables      /* Handle single-character metacharacters. In multiline mode, ^ disables
2402      the setting of any following char as a first character. */      the setting of any following char as a first character. */
2403    
# Line 1739  for (;; ptr++) Line 2426  for (;; ptr++)
2426      *code++ = OP_ANY;      *code++ = OP_ANY;
2427      break;      break;
2428    
2429    
2430        /* ===================================================================*/
2431      /* Character classes. If the included characters are all < 256, we build a      /* Character classes. If the included characters are all < 256, we build a
2432      32-byte bitmap of the permitted characters, except in the special case      32-byte bitmap of the permitted characters, except in the special case
2433      where there is only one such character. For negated classes, we build the      where there is only one such character. For negated classes, we build the
# Line 1777  for (;; ptr++) Line 2466  for (;; ptr++)
2466        }        }
2467    
2468      /* Keep a count of chars with values < 256 so that we can optimize the case      /* Keep a count of chars with values < 256 so that we can optimize the case
2469      of just a single character (as long as it's < 256). For higher valued UTF-8      of just a single character (as long as it's < 256). However, For higher
2470      characters, we don't yet do any optimization. */      valued UTF-8 characters, we don't yet do any optimization. */
2471    
2472      class_charcount = 0;      class_charcount = 0;
2473      class_lastchar = -1;      class_lastchar = -1;
2474    
2475        /* Initialize the 32-char bit map to all zeros. We build the map in a
2476        temporary bit of memory, in case the class contains only 1 character (less
2477        than 256), because in that case the compiled code doesn't use the bit map.
2478        */
2479    
2480        memset(classbits, 0, 32 * sizeof(uschar));
2481    
2482  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2483      class_utf8 = FALSE;                       /* No chars >= 256 */      class_utf8 = FALSE;                       /* No chars >= 256 */
2484      class_utf8data = code + LINK_SIZE + 34;   /* For UTF-8 items */      class_utf8data = code + LINK_SIZE + 2;    /* For UTF-8 items */
2485  #endif  #endif
2486    
     /* Initialize the 32-char bit map to all zeros. We have to build the  
     map in a temporary bit of store, in case the class contains only 1  
     character (< 256), because in that case the compiled code doesn't use the  
     bit map. */  
   
     memset(classbits, 0, 32 * sizeof(uschar));  
   
2487      /* Process characters until ] is reached. By writing this as a "do" it      /* Process characters until ] is reached. By writing this as a "do" it
2488      means that an initial ] is taken as a data character. The first pass      means that an initial ] is taken as a data character. At the start of the
2489      through the regex checked the overall syntax, so we don't need to be very      loop, c contains the first byte of the character. */
     strict here. At the start of the loop, c contains the first byte of the  
     character. */  
2490    
2491      do      if (c != 0) do
2492        {        {
2493          const uschar *oldptr;
2494    
2495  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2496        if (utf8 && c > 127)        if (utf8 && c > 127)
2497          {                           /* Braces are required because the */          {                           /* Braces are required because the */
# Line 1814  for (;; ptr++) Line 2503  for (;; ptr++)
2503    
2504        if (inescq)        if (inescq)
2505          {          {
2506          if (c == '\\' && ptr[1] == 'E')          if (c == '\\' && ptr[1] == 'E')     /* If we are at \E */
2507            {            {
2508            inescq = FALSE;            inescq = FALSE;                   /* Reset literal state */
2509            ptr++;            ptr++;                            /* Skip the 'E' */
2510            continue;            continue;                         /* Carry on with next */
2511            }            }
2512          else goto LONE_SINGLE_CHARACTER;          goto CHECK_RANGE;                   /* Could be range if \E follows */
2513          }          }
2514    
2515        /* Handle POSIX class names. Perl allows a negation extension of the        /* Handle POSIX class names. Perl allows a negation extension of the
# Line 1911  for (;; ptr++) Line 2600  for (;; ptr++)
2600          }          }
2601    
2602        /* Backslash may introduce a single character, or it may introduce one        /* Backslash may introduce a single character, or it may introduce one
2603        of the specials, which just set a flag. Escaped items are checked for        of the specials, which just set a flag. The sequence \b is a special
2604        validity in the pre-compiling pass. The sequence \b is a special case.        case. Inside a class (and only there) it is treated as backspace.
2605        Inside a class (and only there) it is treated as backspace. Elsewhere        Elsewhere it marks a word boundary. Other escapes have preset maps ready
2606        it marks a word boundary. Other escapes have preset maps ready to        to or into the one we are building. We assume they have more than one
       or into the one we are building. We assume they have more than one  
2607        character in them, so set class_charcount bigger than one. */        character in them, so set class_charcount bigger than one. */
2608    
2609        if (c == '\\')        if (c == '\\')
2610          {          {
2611          c = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);          c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2612            if (*errorcodeptr != 0) goto FAILED;
2613    
2614          if (-c == ESC_b) c = '\b';       /* \b is backslash in a class */          if (-c == ESC_b) c = '\b';       /* \b is backslash in a class */
2615          else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */          else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */
2616            else if (-c == ESC_R) c = 'R';   /* \R is literal R in a class */
2617          else if (-c == ESC_Q)            /* Handle start of quoted string */          else if (-c == ESC_Q)            /* Handle start of quoted string */
2618            {            {
2619            if (ptr[1] == '\\' && ptr[2] == 'E')            if (ptr[1] == '\\' && ptr[2] == 'E')
# Line 1938  for (;; ptr++) Line 2628  for (;; ptr++)
2628            {            {
2629            register const uschar *cbits = cd->cbits;            register const uschar *cbits = cd->cbits;
2630            class_charcount += 2;     /* Greater than 1 is what matters */            class_charcount += 2;     /* Greater than 1 is what matters */
2631            switch (-c)  
2632              /* Save time by not doing this in the pre-compile phase. */
2633    
2634              if (lengthptr == NULL) switch (-c)
2635              {              {
2636              case ESC_d:              case ESC_d:
2637              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
# Line 1966  for (;; ptr++) Line 2659  for (;; ptr++)
2659              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
2660              continue;              continue;
2661    
2662  #ifdef SUPPORT_UCP              case ESC_E: /* Perl ignores an orphan \E */
             case ESC_p:  
             case ESC_P:  
               {  
               BOOL negated;  
               int pdata;  
               int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);  
               if (ptype < 0) goto FAILED;  
               class_utf8 = TRUE;  
               *class_utf8data++ = ((-c == ESC_p) != negated)?  
                 XCL_PROP : XCL_NOTPROP;  
               *class_utf8data++ = ptype;  
               *class_utf8data++ = pdata;  
               class_charcount -= 2;   /* Not a < 256 character */  
               }  
2663              continue;              continue;
 #endif  
2664    
2665              /* Unrecognized escapes are faulted if PCRE is running in its              default:    /* Not recognized; fall through */
2666              strict mode. By default, for compatibility with Perl, they are              break;      /* Need "default" setting to stop compiler warning. */
             treated as literals. */  
   
             default:  
             if ((options & PCRE_EXTRA) != 0)  
               {  
               *errorcodeptr = ERR7;  
               goto FAILED;  
               }  
             c = *ptr;              /* The final character */  
             class_charcount -= 2;  /* Undo the default count from above */  
2667              }              }
           }  
   
         /* Fall through if we have a single character (c >= 0). This may be  
         > 256 in UTF-8 mode. */  
2668    
2669          }   /* End of backslash handling */            /* In the pre-compile phase, just do the recognition. */
2670    
2671        /* A single character may be followed by '-' to form a range. However,            else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2672        Perl does not permit ']' to be the end of the range. A '-' character                     c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
       here is treated as a literal. */  
2673    
2674        if (ptr[1] == '-' && ptr[2] != ']')            /* We need to deal with \H, \h, \V, and \v in both phases because
2675          {            they use extra memory. */
         int d;  
         ptr += 2;  
2676    
2677              if (-c == ESC_h)
2678                {
2679                SETBIT(classbits, 0x09); /* HT */
2680                SETBIT(classbits, 0x20); /* SPACE */
2681                SETBIT(classbits, 0xa0); /* NBSP */
2682  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2683          if (utf8)              if (utf8)
2684            {                           /* Braces are required because the */                {
2685            GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */                class_utf8 = TRUE;
2686            }                *class_utf8data++ = XCL_SINGLE;
2687          else                class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2688                  *class_utf8data++ = XCL_SINGLE;
2689                  class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2690                  *class_utf8data++ = XCL_RANGE;
2691                  class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2692                  class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2693                  *class_utf8data++ = XCL_SINGLE;
2694                  class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2695                  *class_utf8data++ = XCL_SINGLE;
2696                  class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2697                  *class_utf8data++ = XCL_SINGLE;
2698                  class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
2699                  }
2700  #endif  #endif
2701          d = *ptr;  /* Not UTF-8 mode */              continue;
2702                }
         /* The second part of a range can be a single-character escape, but  
         not any of the other escapes. Perl 5.6 treats a hyphen as a literal  
         in such circumstances. */  
2703    
2704          if (d == '\\')            if (-c == ESC_H)
2705                {
2706                for (c = 0; c < 32; c++)
2707                  {
2708                  int x = 0xff;
2709                  switch (c)
2710                    {
2711                    case 0x09/8: x ^= 1 << (0x09%8); break;
2712                    case 0x20/8: x ^= 1 << (0x20%8); break;
2713                    case 0xa0/8: x ^= 1 << (0xa0%8); break;
2714                    default: break;
2715                    }
2716                  classbits[c] |= x;
2717                  }
2718    
2719    #ifdef SUPPORT_UTF8
2720                if (utf8)
2721                  {
2722                  class_utf8 = TRUE;
2723                  *class_utf8data++ = XCL_RANGE;
2724                  class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2725                  class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
2726                  *class_utf8data++ = XCL_RANGE;
2727                  class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
2728                  class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
2729                  *class_utf8data++ = XCL_RANGE;
2730                  class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
2731                  class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
2732                  *class_utf8data++ = XCL_RANGE;
2733                  class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
2734                  class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
2735                  *class_utf8data++ = XCL_RANGE;
2736                  class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
2737                  class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
2738                  *class_utf8data++ = XCL_RANGE;
2739                  class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
2740                  class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
2741                  *class_utf8data++ = XCL_RANGE;
2742                  class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
2743                  class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2744                  }
2745    #endif
2746                continue;
2747                }
2748    
2749              if (-c == ESC_v)
2750                {
2751                SETBIT(classbits, 0x0a); /* LF */
2752                SETBIT(classbits, 0x0b); /* VT */
2753                SETBIT(classbits, 0x0c); /* FF */
2754                SETBIT(classbits, 0x0d); /* CR */
2755                SETBIT(classbits, 0x85); /* NEL */
2756    #ifdef SUPPORT_UTF8
2757                if (utf8)
2758                  {
2759                  class_utf8 = TRUE;
2760                  *class_utf8data++ = XCL_RANGE;
2761                  class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
2762                  class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2763                  }
2764    #endif
2765                continue;
2766                }
2767    
2768              if (-c == ESC_V)
2769                {
2770                for (c = 0; c < 32; c++)
2771                  {
2772                  int x = 0xff;
2773                  switch (c)
2774                    {
2775                    case 0x0a/8: x ^= 1 << (0x0a%8);
2776                                 x ^= 1 << (0x0b%8);
2777                                 x ^= 1 << (0x0c%8);
2778                                 x ^= 1 << (0x0d%8);
2779                                 break;
2780                    case 0x85/8: x ^= 1 << (0x85%8); break;
2781                    default: break;
2782                    }
2783                  classbits[c] |= x;
2784                  }
2785    
2786    #ifdef SUPPORT_UTF8
2787                if (utf8)
2788                  {
2789                  class_utf8 = TRUE;
2790                  *class_utf8data++ = XCL_RANGE;
2791                  class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2792                  class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
2793                  *class_utf8data++ = XCL_RANGE;
2794                  class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2795                  class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2796                  }
2797    #endif
2798                continue;
2799                }
2800    
2801              /* We need to deal with \P and \p in both phases. */
2802    
2803    #ifdef SUPPORT_UCP
2804              if (-c == ESC_p || -c == ESC_P)
2805                {
2806                BOOL negated;
2807                int pdata;
2808                int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2809                if (ptype < 0) goto FAILED;
2810                class_utf8 = TRUE;
2811                *class_utf8data++ = ((-c == ESC_p) != negated)?
2812                  XCL_PROP : XCL_NOTPROP;
2813                *class_utf8data++ = ptype;
2814                *class_utf8data++ = pdata;
2815                class_charcount -= 2;   /* Not a < 256 character */
2816                continue;
2817                }
2818    #endif
2819              /* Unrecognized escapes are faulted if PCRE is running in its
2820              strict mode. By default, for compatibility with Perl, they are
2821              treated as literals. */
2822    
2823              if ((options & PCRE_EXTRA) != 0)
2824                {
2825                *errorcodeptr = ERR7;
2826                goto FAILED;
2827                }
2828    
2829              class_charcount -= 2;  /* Undo the default count from above */
2830              c = *ptr;              /* Get the final character and fall through */
2831              }
2832    
2833            /* Fall through if we have a single character (c >= 0). This may be
2834            greater than 256 in UTF-8 mode. */
2835    
2836            }   /* End of backslash handling */
2837    
2838          /* A single character may be followed by '-' to form a range. However,
2839          Perl does not permit ']' to be the end of the range. A '-' character
2840          at the end is treated as a literal. Perl ignores orphaned \E sequences
2841          entirely. The code for handling \Q and \E is messy. */
2842    
2843          CHECK_RANGE:
2844          while (ptr[1] == '\\' && ptr[2] == 'E')
2845            {
2846            inescq = FALSE;
2847            ptr += 2;
2848            }
2849    
2850          oldptr = ptr;
2851    
2852          if (!inescq && ptr[1] == '-')
2853            {
2854            int d;
2855            ptr += 2;
2856            while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
2857    
2858            /* If we hit \Q (not followed by \E) at this point, go into escaped
2859            mode. */
2860    
2861            while (*ptr == '\\' && ptr[1] == 'Q')
2862              {
2863              ptr += 2;
2864              if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
2865              inescq = TRUE;
2866              break;
2867              }
2868    
2869            if (*ptr == 0 || (!inescq && *ptr == ']'))
2870              {
2871              ptr = oldptr;
2872              goto LONE_SINGLE_CHARACTER;
2873              }
2874    
2875    #ifdef SUPPORT_UTF8
2876            if (utf8)
2877              {                           /* Braces are required because the */
2878              GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */
2879              }
2880            else
2881    #endif
2882            d = *ptr;  /* Not UTF-8 mode */
2883    
2884            /* The second part of a range can be a single-character escape, but
2885            not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2886            in such circumstances. */
2887    
2888            if (!inescq && d == '\\')
2889            {            {
2890            const uschar *oldptr = ptr;            d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2891            d = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);            if (*errorcodeptr != 0) goto FAILED;
2892    
2893            /* \b is backslash; \X is literal X; any other special means the '-'            /* \b is backslash; \X is literal X; \R is literal R; any other
2894            was literal */            special means the '-' was literal */
2895    
2896            if (d < 0)            if (d < 0)
2897              {              {
2898              if (d == -ESC_b) d = '\b';              if (d == -ESC_b) d = '\b';
2899              else if (d == -ESC_X) d = 'X'; else              else if (d == -ESC_X) d = 'X';
2900                else if (d == -ESC_R) d = 'R'; else
2901                {                {
2902                ptr = oldptr - 2;                ptr = oldptr;
2903                goto LONE_SINGLE_CHARACTER;  /* A few lines below */                goto LONE_SINGLE_CHARACTER;  /* A few lines below */
2904                }                }
2905              }              }
2906            }            }
2907    
2908          /* The check that the two values are in the correct order happens in          /* Check that the two values are in the correct order. Optimize
2909          the pre-pass. Optimize one-character ranges */          one-character ranges */
2910    
2911            if (d < c)
2912              {
2913              *errorcodeptr = ERR8;
2914              goto FAILED;
2915              }
2916    
2917          if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */          if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */
2918    
# Line 2067  for (;; ptr++) Line 2933  for (;; ptr++)
2933  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2934            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
2935              {              {
2936              int occ, ocd;              unsigned int occ, ocd;
2937              int cc = c;              unsigned int cc = c;
2938              int origd = d;              unsigned int origd = d;
2939              while (get_othercase_range(&cc, origd, &occ, &ocd))              while (get_othercase_range(&cc, origd, &occ, &ocd))
2940                {                {
2941                if (occ >= c && ocd <= d) continue;  /* Skip embedded ranges */                if (occ >= (unsigned int)c &&
2942                      ocd <= (unsigned int)d)
2943                    continue;                          /* Skip embedded ranges */
2944    
2945                if (occ < c  && ocd >= c - 1)        /* Extend the basic range */                if (occ < (unsigned int)c  &&
2946                      ocd >= (unsigned int)c - 1)      /* Extend the basic range */
2947                  {                                  /* if there is overlap,   */                  {                                  /* if there is overlap,   */
2948                  c = occ;                           /* noting that if occ < c */                  c = occ;                           /* noting that if occ < c */
2949                  continue;                          /* we can't have ocd > d  */                  continue;                          /* we can't have ocd > d  */
2950                  }                                  /* because a subrange is  */                  }                                  /* because a subrange is  */
2951                if (ocd > d && occ <= d + 1)         /* always shorter than    */                if (ocd > (unsigned int)d &&
2952                      occ <= (unsigned int)d + 1)      /* always shorter than    */
2953                  {                                  /* the basic range.       */                  {                                  /* the basic range.       */
2954                  d = ocd;                  d = ocd;
2955                  continue;                  continue;
# Line 2127  for (;; ptr++) Line 2997  for (;; ptr++)
2997          ranges that lie entirely within 0-127 when there is UCP support; else          ranges that lie entirely within 0-127 when there is UCP support; else
2998          for partial ranges without UCP support. */          for partial ranges without UCP support. */
2999    
3000          for (; c <= d; c++)          class_charcount += d - c + 1;
3001            class_lastchar = d;
3002    
3003            /* We can save a bit of time by skipping this in the pre-compile. */
3004    
3005            if (lengthptr == NULL) for (; c <= d; c++)
3006            {            {
3007            classbits[c/8] |= (1 << (c&7));            classbits[c/8] |= (1 << (c&7));
3008            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
# Line 2135  for (;; ptr++) Line 3010  for (;; ptr++)
3010              int uc = cd->fcc[c];           /* flip case */              int uc = cd->fcc[c];           /* flip case */
3011              classbits[uc/8] |= (1 << (uc&7));              classbits[uc/8] |= (1 << (uc&7));
3012              }              }
           class_charcount++;                /* in case a one-char range */  
           class_lastchar = c;  
3013            }            }
3014    
3015          continue;   /* Go get the next char in the class */          continue;   /* Go get the next char in the class */
# Line 2160  for (;; ptr++) Line 3033  for (;; ptr++)
3033  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
3034          if ((options & PCRE_CASELESS) != 0)          if ((options & PCRE_CASELESS) != 0)
3035            {            {
3036            int othercase;            unsigned int othercase;
3037            if ((othercase = _pcre_ucp_othercase(c)) >= 0)            if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
3038              {              {
3039              *class_utf8data++ = XCL_SINGLE;              *class_utf8data++ = XCL_SINGLE;
3040              class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);              class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
# Line 2186  for (;; ptr++) Line 3059  for (;; ptr++)
3059          }          }
3060        }        }
3061    
3062      /* Loop until ']' reached; the check for end of string happens inside the      /* Loop until ']' reached. This "while" is the end of the "do" above. */
     loop. This "while" is the end of the "do" above. */  
3063    
3064      while ((c = *(++ptr)) != ']' || inescq);      while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
3065    
3066        if (c == 0)                          /* Missing terminating ']' */
3067          {
3068          *errorcodeptr = ERR6;
3069          goto FAILED;
3070          }
3071    
3072      /* If class_charcount is 1, we saw precisely one character whose value is      /* If class_charcount is 1, we saw precisely one character whose value is
3073      less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we      less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
# Line 2253  for (;; ptr++) Line 3131  for (;; ptr++)
3131    
3132      /* If there are characters with values > 255, we have to compile an      /* If there are characters with values > 255, we have to compile an
3133      extended class, with its own opcode. If there are no characters < 256,      extended class, with its own opcode. If there are no characters < 256,
3134      we can omit the bitmap. */      we can omit the bitmap in the actual compiled code. */
3135    
3136  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
3137      if (class_utf8)      if (class_utf8)
# Line 2263  for (;; ptr++) Line 3141  for (;; ptr++)
3141        code += LINK_SIZE;        code += LINK_SIZE;
3142        *code = negate_class? XCL_NOT : 0;        *code = negate_class? XCL_NOT : 0;
3143    
3144        /* If the map is required, install it, and move on to the end of        /* If the map is required, move up the extra data to make room for it;
3145        the extra data */        otherwise just move the code pointer to the end of the extra data. */
3146    
3147        if (class_charcount > 0)        if (class_charcount > 0)
3148          {          {
3149          *code++ |= XCL_MAP;          *code++ |= XCL_MAP;
3150            memmove(code + 32, code, class_utf8data - code);
3151          memcpy(code, classbits, 32);          memcpy(code, classbits, 32);
3152          code = class_utf8data;          code = class_utf8data + 32;
         }  
   
       /* If the map is not required, slide down the extra data. */  
   
       else  
         {  
         int len = class_utf8data - (code + 33);  
         memmove(code + 1, code + 33, len);  
         code += len + 1;  
3153          }          }
3154          else code = class_utf8data;
3155    
3156        /* Now fill in the complete length of the item */        /* Now fill in the complete length of the item */
3157    
# Line 2297  for (;; ptr++) Line 3168  for (;; ptr++)
3168      if (negate_class)      if (negate_class)
3169        {        {
3170        *code++ = OP_NCLASS;        *code++ = OP_NCLASS;
3171        for (c = 0; c < 32; c++) code[c] = ~classbits[c];        if (lengthptr == NULL)    /* Save time in the pre-compile phase */
3172            for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3173        }        }
3174      else      else
3175        {        {
# Line 2307  for (;; ptr++) Line 3179  for (;; ptr++)
3179      code += 32;      code += 32;
3180      break;      break;
3181    
3182    
3183        /* ===================================================================*/
3184      /* Various kinds of repeat; '{' is not necessarily a quantifier, but this      /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3185      has been tested above. */      has been tested above. */
3186    
# Line 2374  for (;; ptr++) Line 3248  for (;; ptr++)
3248        }        }
3249      else repeat_type = greedy_default;      else repeat_type = greedy_default;
3250    
     /* If previous was a recursion, we need to wrap it inside brackets so that  
     it can be replicated if necessary. */  
   
     if (*previous == OP_RECURSE)  
       {  
       memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);  
       code += 1 + LINK_SIZE;  
       *previous = OP_BRA;  
       PUT(previous, 1, code - previous);  
       *code = OP_KET;  
       PUT(code, 1, code - previous);  
       code += 1 + LINK_SIZE;  
       }  
   
3251      /* If previous was a character match, abolish the item and generate a      /* If previous was a character match, abolish the item and generate a
3252      repeat item instead. If a char item has a minimum of more than one, ensure      repeat item instead. If a char item has a minimum of more than one, ensure
3253      that it is set in reqbyte - it might not be if a sequence such as x{3} is      that it is set in reqbyte - it might not be if a sequence such as x{3} is
# Line 2421  for (;; ptr++) Line 3281  for (;; ptr++)
3281          if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;          if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3282          }          }
3283    
3284          /* If the repetition is unlimited, it pays to see if the next thing on
3285          the line is something that cannot possibly match this character. If so,
3286          automatically possessifying this item gains some performance in the case
3287          where the match fails. */
3288    
3289          if (!possessive_quantifier &&
3290              repeat_max < 0 &&
3291              check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3292                options, cd))
3293            {
3294            repeat_type = 0;    /* Force greedy */
3295            possessive_quantifier = TRUE;
3296            }
3297    
3298        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
3299        }        }
3300    
3301      /* If previous was a single negated character ([^a] or similar), we use      /* If previous was a single negated character ([^a] or similar), we use
3302      one of the special opcodes, replacing it. The code is shared with single-      one of the special opcodes, replacing it. The code is shared with single-
3303      character repeats by setting opt_type to add a suitable offset into      character repeats by setting opt_type to add a suitable offset into
3304      repeat_type. OP_NOT is currently used only for single-byte chars. */      repeat_type. We can also test for auto-possessification. OP_NOT is
3305        currently used only for single-byte chars. */
3306    
3307      else if (*previous == OP_NOT)      else if (*previous == OP_NOT)
3308        {        {
3309        op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */        op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */
3310        c = previous[1];        c = previous[1];
3311          if (!possessive_quantifier &&
3312              repeat_max < 0 &&
3313              check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3314            {
3315            repeat_type = 0;    /* Force greedy */
3316            possessive_quantifier = TRUE;
3317            }
3318        goto OUTPUT_SINGLE_REPEAT;        goto OUTPUT_SINGLE_REPEAT;
3319        }        }
3320    
# Line 2450  for (;; ptr++) Line 3332  for (;; ptr++)
3332        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
3333        c = *previous;        c = *previous;
3334    
3335          if (!possessive_quantifier &&
3336              repeat_max < 0 &&
3337              check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3338            {
3339            repeat_type = 0;    /* Force greedy */
3340            possessive_quantifier = TRUE;
3341            }
3342    
3343        OUTPUT_SINGLE_REPEAT:        OUTPUT_SINGLE_REPEAT:
3344        if (*previous == OP_PROP || *previous == OP_NOTPROP)        if (*previous == OP_PROP || *previous == OP_NOTPROP)
3345          {          {
# Line 2490  for (;; ptr++) Line 3380  for (;; ptr++)
3380          }          }
3381    
3382        /* A repeat minimum of 1 is optimized into some special cases. If the        /* A repeat minimum of 1 is optimized into some special cases. If the
3383        maximum is unlimited, we use OP_PLUS. Otherwise, the original item it        maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3384        left in place and, if the maximum is greater than 1, we use OP_UPTO with        left in place and, if the maximum is greater than 1, we use OP_UPTO with
3385        one less than the maximum. */        one less than the maximum. */
3386    
# Line 2543  for (;; ptr++) Line 3433  for (;; ptr++)
3433            }            }
3434    
3435          /* Else insert an UPTO if the max is greater than the min, again          /* Else insert an UPTO if the max is greater than the min, again
3436          preceded by the character, for the previously inserted code. */          preceded by the character, for the previously inserted code. If the
3437            UPTO is just for 1 instance, we can use QUERY instead. */
3438    
3439          else if (repeat_max != repeat_min)          else if (repeat_max != repeat_min)
3440            {            {
# Line 2562  for (;; ptr++) Line 3453  for (;; ptr++)
3453              *code++ = prop_value;              *code++ = prop_value;
3454              }              }
3455            repeat_max -= repeat_min;            repeat_max -= repeat_min;
3456            *code++ = OP_UPTO + repeat_type;  
3457            PUT2INC(code, 0, repeat_max);            if (repeat_max == 1)
3458                {
3459                *code++ = OP_QUERY + repeat_type;
3460                }
3461              else
3462                {
3463                *code++ = OP_UPTO + repeat_type;
3464                PUT2INC(code, 0, repeat_max);
3465                }
3466            }            }
3467          }          }
3468    
# Line 2630  for (;; ptr++) Line 3529  for (;; ptr++)
3529      /* If previous was a bracket group, we may have to replicate it in certain      /* If previous was a bracket group, we may have to replicate it in certain
3530      cases. */      cases. */
3531    
3532      else if (*previous >= OP_BRA || *previous == OP_ONCE ||      else if (*previous == OP_BRA  || *previous == OP_CBRA ||
3533               *previous == OP_COND)               *previous == OP_ONCE || *previous == OP_COND)
3534        {        {
3535        register int i;        register int i;
3536        int ketoffset = 0;        int ketoffset = 0;
3537        int len = code - previous;        int len = code - previous;
3538        uschar *bralink = NULL;        uschar *bralink = NULL;
3539    
3540          /* Repeating a DEFINE group is pointless */
3541    
3542          if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3543            {
3544            *errorcodeptr = ERR55;
3545            goto FAILED;
3546            }
3547    
3548        /* If the maximum repeat count is unlimited, find the end of the bracket        /* If the maximum repeat count is unlimited, find the end of the bracket
3549        by scanning through from the start, and compute the offset back to it        by scanning through from the start, and compute the offset back to it
3550        from the current code pointer. There may be an OP_OPT setting following        from the current code pointer. There may be an OP_OPT setting following
# Line 2672  for (;; ptr++) Line 3579  for (;; ptr++)
3579          /* If the maximum is 1 or unlimited, we just have to stick in the          /* If the maximum is 1 or unlimited, we just have to stick in the
3580          BRAZERO and do no more at this point. However, we do need to adjust          BRAZERO and do no more at this point. However, we do need to adjust
3581          any OP_RECURSE calls inside the group that refer to the group itself or          any OP_RECURSE calls inside the group that refer to the group itself or
3582          any internal group, because the offset is from the start of the whole          any internal or forward referenced group, because the offset is from
3583          regex. Temporarily terminate the pattern while doing this. */          the start of the whole regex. Temporarily terminate the pattern while
3584            doing this. */
3585    
3586          if (repeat_max <= 1)          if (repeat_max <= 1)
3587            {            {
3588            *code = OP_END;            *code = OP_END;
3589            adjust_recurse(previous, 1, utf8, cd);            adjust_recurse(previous, 1, utf8, cd, save_hwm);
3590            memmove(previous+1, previous, len);            memmove(previous+1, previous, len);
3591            code++;            code++;
3592            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
# Line 2696  for (;; ptr++) Line 3604  for (;; ptr++)
3604            {            {
3605            int offset;            int offset;
3606            *code = OP_END;            *code = OP_END;
3607            adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd);            adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3608            memmove(previous + 2 + LINK_SIZE, previous, len);            memmove(previous + 2 + LINK_SIZE, previous, len);
3609            code += 2 + LINK_SIZE;            code += 2 + LINK_SIZE;
3610            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
# Line 2716  for (;; ptr++) Line 3624  for (;; ptr++)
3624        /* If the minimum is greater than zero, replicate the group as many        /* If the minimum is greater than zero, replicate the group as many
3625        times as necessary, and adjust the maximum to the number of subsequent        times as necessary, and adjust the maximum to the number of subsequent
3626        copies that we need. If we set a first char from the group, and didn't        copies that we need. If we set a first char from the group, and didn't
3627        set a required char, copy the latter from the former. */        set a required char, copy the latter from the former. If there are any
3628          forward reference subroutine calls in the group, there will be entries on
3629          the workspace list; replicate these with an appropriate increment. */
3630    
3631        else        else
3632          {          {
3633          if (repeat_min > 1)          if (repeat_min > 1)
3634            {            {
3635            if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;            /* In the pre-compile phase, we don't actually do the replication. We
3636            for (i = 1; i < repeat_min; i++)            just adjust the length as if we had. Do some paranoid checks for
3637              potential integer overflow. */
3638    
3639              if (lengthptr != NULL)
3640                {
3641                int delta = (repeat_min - 1)*length_prevgroup;
3642                if ((double)(repeat_min - 1)*(double)length_prevgroup >
3643                                                                (double)INT_MAX ||
3644                    OFLOW_MAX - *lengthptr < delta)
3645                  {
3646                  *errorcodeptr = ERR20;
3647                  goto FAILED;
3648                  }
3649                *lengthptr += delta;
3650                }
3651    
3652              /* This is compiling for real */
3653    
3654              else
3655              {              {
3656              memcpy(code, previous, len);              if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3657              code += len;              for (i = 1; i < repeat_min; i++)
3658                  {
3659                  uschar *hc;
3660                  uschar *this_hwm = cd->hwm;
3661                  memcpy(code, previous, len);
3662                  for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3663                    {
3664                    PUT(cd->hwm, 0, GET(hc, 0) + len);
3665                    cd->hwm += LINK_SIZE;
3666                    }
3667                  save_hwm = this_hwm;
3668                  code += len;
3669                  }
3670              }              }
3671            }            }
3672    
3673          if (repeat_max > 0) repeat_max -= repeat_min;          if (repeat_max > 0) repeat_max -= repeat_min;
3674          }          }
3675    
# Line 2736  for (;; ptr++) Line 3677  for (;; ptr++)
3677        the maximum is limited, it replicates the group in a nested fashion,        the maximum is limited, it replicates the group in a nested fashion,
3678        remembering the bracket starts on a stack. In the case of a zero minimum,        remembering the bracket starts on a stack. In the case of a zero minimum,
3679        the first one was set up above. In all cases the repeat_max now specifies        the first one was set up above. In all cases the repeat_max now specifies
3680        the number of additional copies needed. */        the number of additional copies needed. Again, we must remember to
3681          replicate entries on the forward reference list. */
3682    
3683        if (repeat_max >= 0)        if (repeat_max >= 0)
3684          {          {
3685          for (i = repeat_max - 1; i >= 0; i--)          /* In the pre-compile phase, we don't actually do the replication. We
3686            just adjust the length as if we had. For each repetition we must add 1
3687            to the length for BRAZERO and for all but the last repetition we must
3688            add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
3689            paranoid checks to avoid integer overflow. */
3690    
3691            if (lengthptr != NULL && repeat_max > 0)
3692              {
3693              int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3694                          2 - 2*LINK_SIZE;   /* Last one doesn't nest */
3695              if ((double)repeat_max *
3696                    (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
3697                      > (double)INT_MAX ||
3698                  OFLOW_MAX - *lengthptr < delta)
3699                {
3700                *errorcodeptr = ERR20;
3701                goto FAILED;
3702                }
3703              *lengthptr += delta;
3704              }
3705    
3706            /* This is compiling for real */
3707    
3708            else for (i = repeat_max - 1; i >= 0; i--)
3709            {            {
3710              uschar *hc;
3711              uschar *this_hwm = cd->hwm;
3712    
3713            *code++ = OP_BRAZERO + repeat_type;            *code++ = OP_BRAZERO + repeat_type;
3714    
3715            /* All but the final copy start a new nesting, maintaining the            /* All but the final copy start a new nesting, maintaining the
# Line 2757  for (;; ptr++) Line 3725  for (;; ptr++)
3725              }              }
3726    
3727            memcpy(code, previous, len);            memcpy(code, previous, len);
3728              for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3729                {
3730                PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3731                cd->hwm += LINK_SIZE;
3732                }
3733              save_hwm = this_hwm;
3734            code += len;            code += len;
3735            }            }
3736    
# Line 2779  for (;; ptr++) Line 3753  for (;; ptr++)
3753        /* If the maximum is unlimited, set a repeater in the final copy. We        /* If the maximum is unlimited, set a repeater in the final copy. We
3754        can't just offset backwards from the current code point, because we        can't just offset backwards from the current code point, because we
3755        don't know if there's been an options resetting after the ket. The        don't know if there's been an options resetting after the ket. The
3756        correct offset was computed above. */        correct offset was computed above.
3757    
3758          Then, when we are doing the actual compile phase, check to see whether
3759          this group is a non-atomic one that could match an empty string. If so,
3760          convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
3761          that runtime checking can be done. [This check is also applied to
3762          atomic groups at runtime, but in a different way.] */
3763    
3764        else code[-ketoffset] = OP_KETRMAX + repeat_type;        else
3765            {
3766            uschar *ketcode = code - ketoffset;
3767            uschar *bracode = ketcode - GET(ketcode, 1);
3768            *ketcode = OP_KETRMAX + repeat_type;
3769            if (lengthptr == NULL && *bracode != OP_ONCE)
3770              {
3771              uschar *scode = bracode;
3772              do
3773                {
3774                if (could_be_empty_branch(scode, ketcode, utf8))
3775                  {
3776                  *bracode += OP_SBRA - OP_BRA;
3777                  break;
3778                  }
3779                scode += GET(scode, 1);
3780                }
3781              while (*scode == OP_ALT);
3782              }
3783            }
3784        }        }
3785    
3786      /* Else there's some kind of shambles */      /* Else there's some kind of shambles */
# Line 2792  for (;; ptr++) Line 3791  for (;; ptr++)
3791        goto FAILED;        goto FAILED;
3792        }        }
3793    
3794      /* If the character following a repeat is '+', we wrap the entire repeated      /* If the character following a repeat is '+', or if certain optimization
3795      item inside OP_ONCE brackets. This is just syntactic sugar, taken from      tests above succeeded, possessive_quantifier is TRUE. For some of the
3796      Sun's Java package. The repeated item starts at tempcode, not at previous,      simpler opcodes, there is a special alternative opcode for this. For
3797      which might be the first part of a string whose (former) last char we      anything else, we wrap the entire repeated item inside OP_ONCE brackets.
3798      repeated. However, we don't support '+' after a greediness '?'. */      The '+' notation is just syntactic sugar, taken from Sun's Java package,
3799        but the special opcodes can optimize it a bit. The repeated item starts at
3800        tempcode, not at previous, which might be the first part of a string whose
3801        (former) last char we repeated.
3802    
3803        Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
3804        an 'upto' may follow. We skip over an 'exact' item, and then test the
3805        length of what remains before proceeding. */
3806    
3807      if (possessive_quantifier)      if (possessive_quantifier)
3808        {        {
3809        int len = code - tempcode;        int len;
3810        memmove(tempcode + 1+LINK_SIZE, tempcode, len);        if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
3811        code += 1 + LINK_SIZE;            *tempcode == OP_NOTEXACT)
3812        len += 1 + LINK_SIZE;          tempcode += _pcre_OP_lengths[*tempcode];
3813        tempcode[0] = OP_ONCE;        len = code - tempcode;
3814        *code++ = OP_KET;        if (len > 0) switch (*tempcode)
3815        PUTINC(code, 0, len);          {
3816        PUT(tempcode, 1, len);          case OP_STAR:  *tempcode = OP_POSSTAR; break;
3817            case OP_PLUS:  *tempcode = OP_POSPLUS; break;
3818            case OP_QUERY: *tempcode = OP_POSQUERY; break;
3819            case OP_UPTO:  *tempcode = OP_POSUPTO; break;
3820    
3821            case OP_TYPESTAR:  *tempcode = OP_TYPEPOSSTAR; break;
3822            case OP_TYPEPLUS:  *tempcode = OP_TYPEPOSPLUS; break;
3823            case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
3824            case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;
3825    
3826            case OP_NOTSTAR:  *tempcode = OP_NOTPOSSTAR; break;
3827            case OP_NOTPLUS:  *tempcode = OP_NOTPOSPLUS; break;
3828            case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
3829            case OP_NOTUPTO:  *tempcode = OP_NOTPOSUPTO; break;
3830    
3831            default:
3832            memmove(tempcode + 1+LINK_SIZE, tempcode, len);
3833            code += 1 + LINK_SIZE;
3834            len += 1 + LINK_SIZE;
3835            tempcode[0] = OP_ONCE;
3836            *code++ = OP_KET;
3837            PUTINC(code, 0, len);
3838            PUT(tempcode, 1, len);
3839            break;
3840            }
3841        }        }
3842    
3843      /* In all cases we no longer have a previous item. We also set the      /* In all cases we no longer have a previous item. We also set the
# Line 2820  for (;; ptr++) Line 3850  for (;; ptr++)
3850      break;      break;
3851    
3852    
3853      /* Start of nested bracket sub-expression, or comment or lookahead or      /* ===================================================================*/
3854      lookbehind or option setting or condition. First deal with special things      /* Start of nested parenthesized sub-expression, or comment or lookahead or
3855      that can come after a bracket; all are introduced by ?, and the appearance      lookbehind or option setting or condition or all the other extended
3856      of any of them means that this is not a referencing group. They were      parenthesis forms. First deal with the specials; all are introduced by ?,
3857      checked for validity in the first pass over the string, so we don't have to      and the appearance of any of them means that this is not a capturing
3858      check for syntax errors here.  */      group. */
3859    
3860      case '(':      case '(':
3861      newoptions = options;      newoptions = options;
3862      skipbytes = 0;      skipbytes = 0;
3863        bravalue = OP_CBRA;
3864        save_hwm = cd->hwm;
3865        reset_bracount = FALSE;
3866    
3867      if (*(++ptr) == '?')      if (*(++ptr) == '?')
3868        {        {
3869        int set, unset;        int i, set, unset, namelen;
3870        int *optset;        int *optset;
3871          const uschar *name;
3872          uschar *slot;
3873    
3874        switch (*(++ptr))        switch (*(++ptr))
3875          {          {
3876          case '#':                 /* Comment; skip to ket */          case '#':                 /* Comment; skip to ket */
3877          ptr++;          ptr++;
3878          while (*ptr != ')') ptr++;          while (*ptr != 0 && *ptr != ')') ptr++;
3879            if (*ptr == 0)
3880              {
3881              *errorcodeptr = ERR18;
3882              goto FAILED;
3883              }
3884          continue;          continue;
3885    
3886          case ':':                 /* Non-extracting bracket */  
3887            /* ------------------------------------------------------------ */
3888            case '|':                 /* Reset capture count for each branch */
3889            reset_bracount = TRUE;
3890            /* Fall through */
3891    
3892            /* ------------------------------------------------------------ */
3893            case ':':                 /* Non-capturing bracket */
3894          bravalue = OP_BRA;          bravalue = OP_BRA;
3895          ptr++;          ptr++;
3896          break;          break;
3897    
3898    
3899            /* ------------------------------------------------------------ */
3900          case '(':          case '(':
3901          bravalue = OP_COND;       /* Conditional group */          bravalue = OP_COND;       /* Conditional group */
3902    
3903          /* Condition to test for recursion */          /* A condition can be an assertion, a number (referring to a numbered
3904            group), a name (referring to a named group), or 'R', referring to
3905            recursion. R<digits> and R&name are also permitted for recursion tests.
3906    
3907            There are several syntaxes for testing a named group: (?(name)) is used
3908            by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
3909    
3910            There are two unfortunate ambiguities, caused by history. (a) 'R' can
3911            be the recursive thing or the name 'R' (and similarly for 'R' followed
3912            by digits), and (b) a number could be a name that consists of digits.
3913            In both cases, we look for a name first; if not found, we try the other
3914            cases. */
3915    
3916            /* For conditions that are assertions, check the syntax, and then exit
3917            the switch. This will take control down to where bracketed groups,
3918            including assertions, are processed. */
3919    
3920          if (ptr[1] == 'R')          if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
3921              break;
3922    
3923            /* Most other conditions use OP_CREF (a couple change to OP_RREF
3924            below), and all need to skip 3 bytes at the start of the group. */
3925    
3926            code[1+LINK_SIZE] = OP_CREF;
3927            skipbytes = 3;
3928            refsign = -1;
3929    
3930            /* Check for a test for recursion in a named group. */
3931    
3932            if (ptr[1] == 'R' && ptr[2] == '&')
3933            {            {
3934            code[1+LINK_SIZE] = OP_CREF;            terminator = -1;
3935            PUT2(code, 2+LINK_SIZE, CREF_RECURSE);            ptr += 2;
3936            skipbytes = 3;            code[1+LINK_SIZE] = OP_RREF;    /* Change the type of test */
           ptr += 3;  
3937            }            }
3938    
3939          /* Condition to test for a numbered subpattern match. We know that          /* Check for a test for a named group's having been set, using the Perl
3940          if a digit follows ( then there will just be digits until ) because          syntax (?(<name>) or (?('name') */
         the syntax was checked in the first pass. */  
3941    
3942          else if ((digitab[ptr[1]] && ctype_digit) != 0)          else if (ptr[1] == '<')
3943            {            {
3944            int condref;                 /* Don't amalgamate; some compilers */            terminator = '>';
           condref = *(++ptr) - '0';    /* grumble at autoincrement in declaration */  
           while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';  
           if (condref == 0)  
             {  
             *errorcodeptr = ERR35;  
             goto FAILED;  
             }  
3945            ptr++;            ptr++;
           code[1+LINK_SIZE] = OP_CREF;  
           PUT2(code, 2+LINK_SIZE, condref);  
           skipbytes = 3;  
3946            }            }
3947          /* For conditions that are assertions, we just fall through, having          else if (ptr[1] == '\'')
3948          set bravalue above. */            {
3949          break;            terminator = '\'';
3950              ptr++;
3951          case '=':                 /* Positive lookahead */            }
3952          bravalue = OP_ASSERT;          else
3953          ptr++;            {
3954          break;            terminator = 0;
3955              if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
3956              }
3957    
3958          case '!':                 /* Negative lookahead */          /* We now expect to read a name; any thing else is an error */
         bravalue = OP_ASSERT_NOT;  
         ptr++;  
         break;  
3959    
3960          case '<':                 /* Lookbehinds */          if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
         switch (*(++ptr))  
3961            {            {
3962            case '=':               /* Positive lookbehind */            ptr += 1;  /* To get the right offset */
3963            bravalue = OP_ASSERTBACK;            *errorcodeptr = ERR28;
3964            ptr++;            goto FAILED;
3965            break;            }
3966    
3967            case '!':               /* Negative lookbehind */          /* Read the name, but also get it as a number if it's all digits */
3968            bravalue = OP_ASSERTBACK_NOT;  
3969            recno = 0;
3970            name = ++ptr;
3971            while ((cd->ctypes[*ptr] & ctype_word) != 0)
3972              {
3973              if (recno >= 0)
3974                recno = ((digitab[*ptr] & ctype_digit) != 0)?
3975                  recno * 10 + *ptr - '0' : -1;
3976            ptr++;            ptr++;
           break;  
3977            }            }
3978          break;          namelen = ptr - name;
3979    
3980          case '>':                 /* One-time brackets */          if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
3981          bravalue = OP_ONCE;            {
3982          ptr++;            ptr--;      /* Error offset */
3983          break;            *errorcodeptr = ERR26;
3984              goto FAILED;
3985              }
3986    
3987          case 'C':                 /* Callout - may be followed by digits; */          /* Do no further checking in the pre-compile phase. */
3988          previous_callout = code;  /* Save for later completion */  
3989          after_manual_callout = 1; /* Skip one item before completing */          if (lengthptr != NULL) break;
3990          *code++ = OP_CALLOUT;     /* Already checked that the terminating */  
3991            {                       /* closing parenthesis is present. */          /* In the real compile we do the work of looking for the actual
3992            int n = 0;          reference. If the string started with "+" or "-" we require the rest to
3993            while ((digitab[*(++ptr)] & ctype_digit) != 0)          be digits, in which case recno will be set. */
3994              n = n * 10 + *ptr - '0';  
3995            if (n > 255)          if (refsign > 0)
3996              {
3997              if (recno <= 0)
3998              {              {
3999              *errorcodeptr = ERR38;              *errorcodeptr = ERR58;
4000              goto FAILED;              goto FAILED;
4001              }              }
4002            *code++ = n;            if (refsign == '-')
4003            PUT(code, 0, ptr - cd->start_pattern + 1);  /* Pattern offset */              {
4004            PUT(code, LINK_SIZE, 0);                    /* Default length */              recno = cd->bracount - recno + 1;
4005            code += 2 * LINK_SIZE;              if (recno <= 0)
4006                  {
4007                  *errorcodeptr = ERR15;
4008                  goto FAILED;
4009                  }
4010                }
4011              else recno += cd->bracount;
4012              PUT2(code, 2+LINK_SIZE, recno);
4013              break;
4014            }            }
         previous = NULL;  
         continue;  
4015    
4016          case 'P':                 /* Named subpattern handling */          /* Otherwise (did not start with "+" or "-"), start by looking for the
4017          if (*(++ptr) == '<')      /* Definition */          name. */
4018    
4019            slot = cd->name_table;
4020            for (i = 0; i < cd->names_found; i++)
4021            {            {
4022            int i, namelen;            if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4023            uschar *slot = cd->name_table;            slot += cd->name_entry_size;
4024            const uschar *name;     /* Don't amalgamate; some compilers */            }
           name = ++ptr;           /* grumble at autoincrement in declaration */  
4025    
4026            while (*ptr++ != '>');          /* Found a previous named subpattern */
           namelen = ptr - name - 1;  
4027    
4028            for (i = 0; i < cd->names_found; i++)          if (i < cd->names_found)
4029              {            {
4030              int crc = memcmp(name, slot+2, namelen);            recno = GET2(slot, 0);
4031              if (crc == 0)            PUT2(code, 2+LINK_SIZE, recno);
4032                {            }
4033                if (slot[2+namelen] == 0)  
4034            /* Search the pattern for a forward reference */
4035    
4036            else if ((i = find_parens(ptr, cd->bracount, name, namelen,
4037                            (options & PCRE_EXTENDED) != 0)) > 0)
4038              {
4039              PUT2(code, 2+LINK_SIZE, i);
4040              }
4041    
4042            /* If terminator == 0 it means that the name followed directly after
4043            the opening parenthesis [e.g. (?(abc)...] and in this case there are
4044            some further alternatives to try. For the cases where terminator != 0
4045            [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
4046            now checked all the possibilities, so give an error. */
4047    
4048            else if (terminator != 0)
4049              {
4050              *errorcodeptr = ERR15;
4051              goto FAILED;
4052              }
4053    
4054            /* Check for (?(R) for recursion. Allow digits after R to specify a
4055            specific group number. */
4056    
4057            else if (*name == 'R')
4058              {
4059              recno = 0;
4060              for (i = 1; i < namelen; i++)
4061                {
4062                if ((digitab[name[i]] & ctype_digit) == 0)
4063                  {
4064                  *errorcodeptr = ERR15;
4065                  goto FAILED;
4066                  }
4067                recno = recno * 10 + name[i] - '0';
4068                }
4069              if (recno == 0) recno = RREF_ANY;
4070              code[1+LINK_SIZE] = OP_RREF;      /* Change test type */
4071              PUT2(code, 2+LINK_SIZE, recno);
4072              }
4073    
4074            /* Similarly, check for the (?(DEFINE) "condition", which is always
4075            false. */
4076    
4077            else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
4078              {
4079              code[1+LINK_SIZE] = OP_DEF;
4080              skipbytes = 1;
4081              }
4082    
4083            /* Check for the "name" actually being a subpattern number. */
4084    
4085            else if (recno > 0)
4086              {
4087              PUT2(code, 2+LINK_SIZE, recno);
4088              }
4089    
4090            /* Either an unidentified subpattern, or a reference to (?(0) */
4091    
4092            else
4093              {
4094              *errorcodeptr = (recno == 0)? ERR35: ERR15;
4095              goto FAILED;
4096              }
4097            break;
4098    
4099    
4100            /* ------------------------------------------------------------ */
4101            case '=':                 /* Positive lookahead */
4102            bravalue = OP_ASSERT;
4103            ptr++;
4104            break;
4105    
4106    
4107            /* ------------------------------------------------------------ */
4108            case '!':                 /* Negative lookahead */
4109            bravalue = OP_ASSERT_NOT;
4110            ptr++;
4111            break;
4112    
4113    
4114            /* ------------------------------------------------------------ */
4115            case '<':                 /* Lookbehind or named define */
4116            switch (ptr[1])
4117              {
4118              case '=':               /* Positive lookbehind */
4119              bravalue = OP_ASSERTBACK;
4120              ptr += 2;
4121              break;
4122    
4123              case '!':               /* Negative lookbehind */
4124              bravalue = OP_ASSERTBACK_NOT;
4125              ptr += 2;
4126              break;
4127    
4128              default:                /* Could be name define, else bad */
4129              if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
4130              ptr++;                  /* Correct offset for error */
4131              *errorcodeptr = ERR24;
4132              goto FAILED;
4133              }
4134            break;
4135    
4136    
4137            /* ------------------------------------------------------------ */
4138            case '>':                 /* One-time brackets */
4139            bravalue = OP_ONCE;
4140            ptr++;
4141            break;
4142    
4143    
4144            /* ------------------------------------------------------------ */
4145            case 'C':                 /* Callout - may be followed by digits; */
4146            previous_callout = code;  /* Save for later completion */
4147            after_manual_callout = 1; /* Skip one item before completing */
4148            *code++ = OP_CALLOUT;
4149              {
4150              int n = 0;
4151              while ((digitab[*(++ptr)] & ctype_digit) != 0)
4152                n = n * 10 + *ptr - '0';
4153              if (*ptr != ')')
4154                {
4155                *errorcodeptr = ERR39;
4156                goto FAILED;
4157                }
4158              if (n > 255)
4159                {
4160                *errorcodeptr = ERR38;
4161                goto FAILED;
4162                }
4163              *code++ = n;
4164              PUT(code, 0, ptr - cd->start_pattern + 1);  /* Pattern offset */
4165              PUT(code, LINK_SIZE, 0);                    /* Default length */
4166              code += 2 * LINK_SIZE;
4167              }
4168            previous = NULL;
4169            continue;
4170    
4171    
4172            /* ------------------------------------------------------------ */
4173            case 'P':                 /* Python-style named subpattern handling */
4174            if (*(++ptr) == '=' || *ptr == '>')  /* Reference or recursion */
4175              {
4176              is_recurse = *ptr == '>';
4177              terminator = ')';
4178              goto NAMED_REF_OR_RECURSE;
4179              }
4180            else if (*ptr != '<')    /* Test for Python-style definition */
4181              {
4182              *errorcodeptr = ERR41;
4183              goto FAILED;
4184              }
4185            /* Fall through to handle (?P< as (?< is handled */
4186    
4187    
4188            /* ------------------------------------------------------------ */
4189            DEFINE_NAME:    /* Come here from (?< handling */
4190            case '\'':
4191              {
4192              terminator = (*ptr == '<')? '>' : '\'';
4193              name = ++ptr;
4194    
4195              while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4196              namelen = ptr - name;
4197    
4198              /* In the pre-compile phase, just do a syntax check. */
4199    
4200              if (lengthptr != NULL)
4201                {
4202                if (*ptr != terminator)
4203                  {
4204                  *errorcodeptr = ERR42;
4205                  goto FAILED;
4206                  }
4207                if (cd->names_found >= MAX_NAME_COUNT)
4208                  {
4209                  *errorcodeptr = ERR49;
4210                  goto FAILED;
4211                  }
4212                if (namelen + 3 > cd->name_entry_size)
4213                  {
4214                  cd->name_entry_size = namelen + 3;
4215                  if (namelen > MAX_NAME_SIZE)
4216                  {                  {
4217                  *errorcodeptr = ERR43;                  *errorcodeptr = ERR48;
4218                  goto FAILED;                  goto FAILED;
4219                  }                  }
               crc = -1;             /* Current name is substring */  
4220                }                }
4221              if (crc < 0)              }
4222    
4223              /* In the real compile, create the entry in the table */
4224    
4225              else
4226                {
4227                slot = cd->name_table;
4228                for (i = 0; i < cd->names_found; i++)
4229                {                {
4230                memmove(slot + cd->name_entry_size, slot,                int crc = memcmp(name, slot+2, namelen);
4231                  (cd->names_found - i) * cd->name_entry_size);                if (crc == 0)
4232                break;                  {
4233                    if (slot[2+namelen] == 0)
4234                      {
4235                      if ((options & PCRE_DUPNAMES) == 0)
4236                        {
4237                        *errorcodeptr = ERR43;
4238                        goto FAILED;
4239                        }
4240                      }
4241                    else crc = -1;      /* Current name is substring */
4242                    }
4243                  if (crc < 0)
4244                    {
4245                    memmove(slot + cd->name_entry_size, slot,
4246                      (cd->names_found - i) * cd->name_entry_size);
4247                    break;
4248                    }
4249                  slot += cd->name_entry_size;
4250                }                }
             slot += cd->name_entry_size;  
             }  
4251    
4252            PUT2(slot, 0, *brackets + 1);              PUT2(slot, 0, cd->bracount + 1);
4253            memcpy(slot + 2, name, namelen);              memcpy(slot + 2, name, namelen);
4254            slot[2+namelen] = 0;              slot[2+namelen] = 0;
4255            cd->names_found++;              }
           goto NUMBERED_GROUP;  
4256            }            }
4257    
4258          if (*ptr == '=' || *ptr == '>')  /* Reference or recursion */          /* In both cases, count the number of names we've encountered. */
4259    
4260            ptr++;                    /* Move past > or ' */
4261            cd->names_found++;
4262            goto NUMBERED_GROUP;
4263    
4264    
4265            /* ------------------------------------------------------------ */
4266            case '&':                 /* Perl recursion/subroutine syntax */
4267            terminator = ')';
4268            is_recurse = TRUE;
4269            /* Fall through */
4270    
4271            /* We come here from the Python syntax above that handles both
4272            references (?P=name) and recursion (?P>name), as well as falling
4273            through from the Perl recursion syntax (?&name). */
4274    
4275            NAMED_REF_OR_RECURSE:
4276            name = ++ptr;
4277            while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4278            namelen = ptr - name;
4279    
4280            /* In the pre-compile phase, do a syntax check and set a dummy
4281            reference number. */
4282    
4283            if (lengthptr != NULL)
4284            {            {
4285            int i, namelen;            if (*ptr != terminator)
4286            int type = *ptr++;              {
4287            const uschar *name = ptr;              *errorcodeptr = ERR42;
4288            uschar *slot = cd->name_table;              goto FAILED;
4289                }
4290              if (namelen > MAX_NAME_SIZE)
4291                {
4292                *errorcodeptr = ERR48;
4293                goto FAILED;
4294                }
4295              recno = 0;
4296              }
4297    
4298            while (*ptr != ')') ptr++;          /* In the real compile, seek the name in the table */
           namelen = ptr - name;  
4299    
4300            else
4301              {
4302              slot = cd->name_table;
4303            for (i = 0; i < cd->names_found; i++)            for (i = 0; i < cd->names_found; i++)
4304              {              {
4305              if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;              if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4306              slot += cd->name_entry_size;              slot += cd->name_entry_size;
4307              }              }
4308            if (i >= cd->names_found)  
4309              if (i < cd->names_found)         /* Back reference */
4310                {
4311                recno = GET2(slot, 0);
4312                }
4313              else if ((recno =                /* Forward back reference */
4314                        find_parens(ptr, cd->bracount, name, namelen,
4315                          (options & PCRE_EXTENDED) != 0)) <= 0)
4316              {              {
4317              *errorcodeptr = ERR15;              *errorcodeptr = ERR15;
4318              goto FAILED;              goto FAILED;
4319              }              }
4320              }
4321    
4322            recno = GET2(slot, 0);          /* In both phases, we can now go to the code that handles numerical
4323            recursion or backreferences. */
           if (type == '>') goto HANDLE_RECURSION;  /* A few lines below */  
   
           /* Back reference */  
4324    
4325            previous = code;          if (is_recurse) goto HANDLE_RECURSION;
4326            *code++ = OP_REF;            else goto HANDLE_REFERENCE;
           PUT2INC(code, 0, recno);  
           cd->backref_map |= (recno < 32)? (1 << recno) : 1;  
           if (recno > cd->top_backref) cd->top_backref = recno;  
           continue;  
           }  
4327    
         /* Should never happen */  
         break;  
4328    
4329          case 'R':                 /* Pattern recursion */          /* ------------------------------------------------------------ */
4330            case 'R':                 /* Recursion */
4331          ptr++;                    /* Same as (?0)      */          ptr++;                    /* Same as (?0)      */
4332          /* Fall through */          /* Fall through */
4333    
         /* Recursion or "subroutine" call */  
4334    
4335          case '0': case '1': case '2': case '3': case '4':          /* ------------------------------------------------------------ */
4336          case '5': case '6': case '7': case '8': case '9':          case '-': case '+':
4337            case '0': case '1': case '2': case '3': case '4':   /* Recursion or */
4338            case '5': case '6': case '7': case '8': case '9':   /* subroutine */
4339            {            {
4340            const uschar *called;            const uschar *called;
4341    
4342              if ((refsign = *ptr) == '+') ptr++;
4343              else if (refsign == '-')
4344                {
4345                if ((digitab[ptr[1]] & ctype_digit) == 0)
4346                  goto OTHER_CHAR_AFTER_QUERY;
4347                ptr++;
4348                }
4349    
4350            recno = 0;            recno = 0;
4351            while((digitab[*ptr] & ctype_digit) != 0)            while((digitab[*ptr] & ctype_digit) != 0)
4352              recno = recno * 10 + *ptr++ - '0';              recno = recno * 10 + *ptr++ - '0';
4353    
4354              if (*ptr != ')')
4355                {
4356                *errorcodeptr = ERR29;
4357                goto FAILED;
4358                }
4359    
4360              if (refsign == '-')
4361                {
4362                if (recno == 0)
4363                  {
4364                  *errorcodeptr = ERR58;
4365                  goto FAILED;
4366                  }
4367                recno = cd->bracount - recno + 1;
4368                if (recno <= 0)
4369                  {
4370                  *errorcodeptr = ERR15;
4371                  goto FAILED;
4372                  }
4373                }
4374              else if (refsign == '+')
4375                {
4376                if (recno == 0)
4377                  {
4378                  *errorcodeptr = ERR58;
4379                  goto FAILED;
4380                  }
4381                recno += cd->bracount;
4382                }
4383    
4384            /* Come here from code above that handles a named recursion */            /* Come here from code above that handles a named recursion */
4385    
4386            HANDLE_RECURSION:            HANDLE_RECURSION:
4387    
4388            previous = code;            previous = code;
4389              called = cd->start_code;
4390    
4391            /* Find the bracket that is being referenced. Temporarily end the            /* When we are actually compiling, find the bracket that is being
4392            regex in case it doesn't exist. */            referenced. Temporarily end the regex in case it doesn't exist before
4393              this point. If we end up with a forward reference, first check that
4394              the bracket does occur later so we can give the error (and position)
4395              now. Then remember this forward reference in the workspace so it can
4396              be filled in at the end. */
4397    
4398            *code = OP_END;            if (lengthptr == NULL)
           called = (recno == 0)?  
             cd->start_code : find_bracket(cd->start_code, utf8, recno);  
   
           if (called == NULL)  
4399              {              {
4400              *errorcodeptr = ERR15;              *code = OP_END;
4401              goto FAILED;              if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
             }  
4402    
4403            /* If the subpattern is still open, this is a recursive call. We              /* Forward reference */
           check to see if this is a left recursion that could loop for ever,  
           and diagnose that case. */  
4404    
4405            if (GET(called, 1) == 0 && could_be_empty(called, code, bcptr, utf8))              if (called == NULL)
4406              {                {
4407              *errorcodeptr = ERR40;                if (find_parens(ptr, cd->bracount, NULL, recno,
4408              goto FAILED;                     (options & PCRE_EXTENDED) != 0) < 0)
4409                    {
4410                    *errorcodeptr = ERR15;
4411                    goto FAILED;
4412                    }
4413                  called = cd->start_code + recno;
4414                  PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
4415                  }
4416    
4417                /* If not a forward reference, and the subpattern is still open,
4418                this is a recursive call. We check to see if this is a left
4419                recursion that could loop for ever, and diagnose that case. */
4420    
4421                else if (GET(called, 1) == 0 &&
4422                         could_be_empty(called, code, bcptr, utf8))
4423                  {
4424                  *errorcodeptr = ERR40;
4425                  goto FAILED;
4426                  }
4427              }              }
4428    
4429            /* Insert the recursion/subroutine item, automatically wrapped inside            /* Insert the recursion/subroutine item, automatically wrapped inside
4430            "once" brackets. */            "once" brackets. Set up a "previous group" length so that a
4431              subsequent quantifier will work. */
4432    
4433            *code = OP_ONCE;            *code = OP_ONCE;
4434            PUT(code, 1, 2 + 2*LINK_SIZE);            PUT(code, 1, 2 + 2*LINK_SIZE);
# Line 3069  for (;; ptr++) Line 4441  for (;; ptr++)
4441            *code = OP_KET;            *code = OP_KET;
4442            PUT(code, 1, 2 + 2*LINK_SIZE);            PUT(code, 1, 2 + 2*LINK_SIZE);
4443            code += 1 + LINK_SIZE;            code += 1 + LINK_SIZE;
4444    
4445              length_prevgroup = 3 + 3*LINK_SIZE;
4446            }            }
4447    
4448            /* Can't determine a first byte now */
4449    
4450            if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4451          continue;          continue;
4452    
         /* Character after (? not specially recognized */  
4453    
4454          default:                  /* Option setting */          /* ------------------------------------------------------------ */
4455            default:              /* Other characters: check option setting */
4456            OTHER_CHAR_AFTER_QUERY:
4457          set = unset = 0;          set = unset = 0;
4458          optset = &set;          optset = &set;
4459    
# Line 3084  for (;; ptr++) Line 4463  for (;; ptr++)
4463              {              {
4464              case '-': optset = &unset; break;              case '-': optset = &unset; break;
4465    
4466                case 'J':    /* Record that it changed in the external options */
4467                *optset |= PCRE_DUPNAMES;
4468                cd->external_options |= PCRE_JCHANGED;
4469                break;
4470    
4471              case 'i': *optset |= PCRE_CASELESS; break;              case 'i': *optset |= PCRE_CASELESS; break;
4472              case 'm': *optset |= PCRE_MULTILINE; break;              case 'm': *optset |= PCRE_MULTILINE; break;
4473              case 's': *optset |= PCRE_DOTALL; break;              case 's': *optset |= PCRE_DOTALL; break;
4474              case 'x': *optset |= PCRE_EXTENDED; break;              case 'x': *optset |= PCRE_EXTENDED; break;
4475              case 'U': *optset |= PCRE_UNGREEDY; break;              case 'U': *optset |= PCRE_UNGREEDY; break;
4476              case 'X': *optset |= PCRE_EXTRA; break;              case 'X': *optset |= PCRE_EXTRA; break;
4477    
4478                default:  *errorcodeptr = ERR12;
4479                          ptr--;    /* Correct the offset */
4480                          goto FAILED;
4481              }              }
4482            }            }
4483    
# Line 3098  for (;; ptr++) Line 4486  for (;; ptr++)
4486          newoptions = (options | set) & (~unset);          newoptions = (options | set) & (~unset);
4487    
4488          /* If the options ended with ')' this is not the start of a nested          /* If the options ended with ')' this is not the start of a nested
4489          group with option changes, so the options change at this level. Compile          group with option changes, so the options change at this level. If this
4490          code to change the ims options if this setting actually changes any of          item is right at the start of the pattern, the options can be
4491          them. We also pass the new setting back so that it can be put at the          abstracted and made external in the pre-compile phase, and ignored in
4492          start of any following branches, and when this group ends (if we are in          the compile phase. This can be helpful when matching -- for instance in
4493          a group), a resetting item can be compiled.          caseless checking of required bytes.
4494    
4495          Note that if this item is right at the start of the pattern, the          If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
4496          options will have been abstracted and made global, so there will be no          definitely *not* at the start of the pattern because something has been
4497          change to compile. */          compiled. In the pre-compile phase, however, the code pointer can have
4498            that value after the start, because it gets reset as code is discarded
4499            during the pre-compile. However, this can happen only at top level - if
4500            we are within parentheses, the starting BRA will still be present. At
4501            any parenthesis level, the length value can be used to test if anything
4502            has been compiled at that level. Thus, a test for both these conditions
4503            is necessary to ensure we correctly detect the start of the pattern in
4504            both phases.
4505    
4506            If we are not at the pattern start, compile code to change the ims
4507            options if this setting actually changes any of them. We also pass the
4508            new setting back so that it can be put at the start of any following
4509            branches, and when this group ends (if we are in a group), a resetting
4510            item can be compiled. */
4511    
4512          if (*ptr == ')')          if (*ptr == ')')
4513            {            {
4514            if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))            if (code == cd->start_code + 1 + LINK_SIZE &&
4515                   (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
4516              {              {
4517              *code++ = OP_OPT;              cd->external_options = newoptions;
4518              *code++ = newoptions & PCRE_IMS;              options = newoptions;
4519              }              }
4520             else
4521                {
4522                if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
4523                  {
4524                  *code++ = OP_OPT;
4525                  *code++ = newoptions & PCRE_IMS;
4526                  }
4527    
4528            /* Change options at this level, and pass them back for use              /* Change options at this level, and pass them back for use
4529            in subsequent branches. Reset the greedy defaults and the case              in subsequent branches. Reset the greedy defaults and the case
4530            value for firstbyte and reqbyte. */              value for firstbyte and reqbyte. */
4531    
4532            *optionsptr = options = newoptions;              *optionsptr = options = newoptions;
4533            greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);              greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
4534            greedy_non_default = greedy_default ^ 1;              greedy_non_default = greedy_default ^ 1;
4535            req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;              req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
4536                }
4537    
4538            previous = NULL;       /* This item can't be repeated */            previous = NULL;       /* This item can't be repeated */
4539            continue;              /* It is complete */            continue;              /* It is complete */
# Line 3136  for (;; ptr++) Line 4546  for (;; ptr++)
4546    
4547          bravalue = OP_BRA;          bravalue = OP_BRA;
4548          ptr++;          ptr++;
4549          }          }     /* End of switch for character following (? */
4550        }        }       /* End of (? handling */
4551    
4552      /* If PCRE_NO_AUTO_CAPTURE is set, all unadorned brackets become      /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
4553      non-capturing and behave like (?:...) brackets */      all unadorned brackets become non-capturing and behave like (?:...)
4554        brackets. */
4555    
4556      else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)      else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
4557        {        {
4558        bravalue = OP_BRA;        bravalue = OP_BRA;
4559        }        }
4560    
4561      /* Else we have a referencing group; adjust the opcode. If the bracket      /* Else we have a capturing group. */
     number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and  
     arrange for the true number to follow later, in an OP_BRANUMBER item. */  
4562    
4563      else      else
4564        {        {
4565        NUMBERED_GROUP:        NUMBERED_GROUP:
4566        if (++(*brackets) > EXTRACT_BASIC_MAX)        cd->bracount += 1;
4567          {        PUT2(code, 1+LINK_SIZE, cd->bracount);
4568          bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;        skipbytes = 2;
         code[1+LINK_SIZE] = OP_BRANUMBER;  
         PUT2(code, 2+LINK_SIZE, *brackets);  
         skipbytes = 3;  
         }  
       else bravalue = OP_BRA + *brackets;  
4569        }        }
4570    
4571      /* Process nested bracketed re. Assertions may not be repeated, but other      /* Process nested bracketed regex. Assertions may not be repeated, but
4572      kinds can be. We copy code into a non-register variable in order to be able      other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
4573      to pass its address because some compilers complain otherwise. Pass in a      non-register variable in order to be able to pass its address because some
4574      new setting for the ims options if they have changed. */      compilers complain otherwise. Pass in a new setting for the ims options if
4575        they have changed. */
4576    
4577      previous = (bravalue >= OP_ONCE)? code : NULL;      previous = (bravalue >= OP_ONCE)? code : NULL;
4578      *code = bravalue;      *code = bravalue;
4579      tempcode = code;      tempcode = code;
4580      tempreqvary = cd->req_varyopt;     /* Save value before bracket */      tempreqvary = cd->req_varyopt;     /* Save value before bracket */
4581        length_prevgroup = 0;              /* Initialize for pre-compile phase */
4582    
4583      if (!compile_regex(      if (!compile_regex(
4584           newoptions,                   /* The complete new option state */           newoptions,                   /* The complete new option state */
4585           options & PCRE_IMS,           /* The previous ims option state */           options & PCRE_IMS,           /* The previous ims option state */
          brackets,                     /* Extracting bracket count */  
4586           &tempcode,                    /* Where to put code (updated) */           &tempcode,                    /* Where to put code (updated) */
4587           &ptr,                         /* Input pointer (updated) */           &ptr,                         /* Input pointer (updated) */
4588           errorcodeptr,                 /* Where to put an error message */           errorcodeptr,                 /* Where to put an error message */
4589           (bravalue == OP_ASSERTBACK ||           (bravalue == OP_ASSERTBACK ||
4590            bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */            bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
4591           skipbytes,                    /* Skip over OP_COND/OP_BRANUMBER */           reset_bracount,               /* True if (?| group */
4592             skipbytes,                    /* Skip over bracket number */
4593           &subfirstbyte,                /* For possible first char */           &subfirstbyte,                /* For possible first char */
4594           &subreqbyte,                  /* For possible last char */           &subreqbyte,                  /* For possible last char */
4595           bcptr,                        /* Current branch chain */           bcptr,                        /* Current branch chain */
4596           cd))                          /* Tables block */           cd,                           /* Tables block */
4597             (lengthptr == NULL)? NULL :   /* Actual compile phase */
4598               &length_prevgroup           /* Pre-compile phase */
4599             ))
4600        goto FAILED;        goto FAILED;
4601    
4602      /* At the end of compiling, code is still pointing to the start of the      /* At the end of compiling, code is still pointing to the start of the
# Line 3196  for (;; ptr++) Line 4605  for (;; ptr++)
4605      is on the bracket. */      is on the bracket. */
4606    
4607      /* If this is a conditional bracket, check that there are no more than      /* If this is a conditional bracket, check that there are no more than
4608      two branches in the group. */      two branches in the group, or just one if it's a DEFINE group. We do this
4609        in the real compile phase, not in the pre-pass, where the whole group may
4610        not be available. */
4611    
4612      else if (bravalue == OP_COND)      if (bravalue == OP_COND && lengthptr == NULL)
4613        {        {
4614        uschar *tc = code;        uschar *tc = code;
4615        condcount = 0;        int condcount = 0;
4616    
4617        do {        do {
4618           condcount++;           condcount++;
# Line 3209  for (;; ptr++) Line 4620  for (;; ptr++)
4620           }           }
4621        while (*tc != OP_KET);        while (*tc != OP_KET);
4622    
4623        if (condcount > 2)        /* A DEFINE group is never obeyed inline (the "condition" is always
4624          false). It must have only one branch. */
4625    
4626          if (code[LINK_SIZE+1] == OP_DEF)
4627          {          {
4628          *errorcodeptr = ERR27;          if (condcount > 1)
4629          goto FAILED;            {
4630              *errorcodeptr = ERR54;
4631              goto FAILED;
4632              }
4633            bravalue = OP_DEF;   /* Just a flag to suppress char handling below */
4634            }
4635    
4636          /* A "normal" conditional group. If there is just one branch, we must not
4637          make use of its firstbyte or reqbyte, because this is equivalent to an
4638          empty second branch. */
4639    
4640          else
4641            {
4642            if (condcount > 2)
4643              {
4644              *errorcodeptr = ERR27;
4645              goto FAILED;
4646              }
4647            if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
4648          }          }
4649          }
4650    
4651        /* Error if hit end of pattern */
4652    
4653        if (*ptr != ')')
4654          {
4655          *errorcodeptr = ERR14;
4656          goto FAILED;
4657          }
4658    
4659        /* If there is just one branch, we must not make use of its firstbyte or      /* In the pre-compile phase, update the length by the length of the nested
4660        reqbyte, because this is equivalent to an empty second branch. */      group, less the brackets at either end. Then reduce the compiled code to
4661        just the brackets so that it doesn't use much memory if it is duplicated by
4662        a quantifier. */
4663    
4664        if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;      if (lengthptr != NULL)
4665          {
4666          if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
4667            {
4668            *errorcodeptr = ERR20;
4669            goto FAILED;
4670            }
4671          *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
4672          code++;
4673          PUTINC(code, 0, 1 + LINK_SIZE);
4674          *code++ = OP_KET;
4675          PUTINC(code, 0, 1 + LINK_SIZE);
4676        }        }
4677    
4678      /* Handle updating of the required and first characters. Update for normal      /* Otherwise update the main code pointer to the end of the group. */
4679      brackets of all kinds, and conditions with two branches (see code above).  
4680      If the bracket is followed by a quantifier with zero repeat, we have to      else code = tempcode;
4681      back off. Hence the definition of zeroreqbyte and zerofirstbyte outside the  
4682      main loop so that they can be accessed for the back off. */      /* For a DEFINE group, required and first character settings are not
4683        relevant. */
4684    
4685        if (bravalue == OP_DEF) break;
4686    
4687        /* Handle updating of the required and first characters for other types of
4688        group. Update for normal brackets of all kinds, and conditions with two
4689        branches (see code above). If the bracket is followed by a quantifier with
4690        zero repeat, we have to back off. Hence the definition of zeroreqbyte and
4691        zerofirstbyte outside the main loop so that they can be accessed for the
4692        back off. */
4693    
4694      zeroreqbyte = reqbyte;      zeroreqbyte = reqbyte;
4695      zerofirstbyte = firstbyte;      zerofirstbyte = firstbyte;
4696      groupsetfirstbyte = FALSE;      groupsetfirstbyte = FALSE;
4697    
4698      if (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_COND)      if (bravalue >= OP_ONCE)
4699        {        {
4700        /* If we have not yet set a firstbyte in this branch, take it from the        /* If we have not yet set a firstbyte in this branch, take it from the
4701        subpattern, remembering that it was set here so that a repeat of more        subpattern, remembering that it was set here so that a repeat of more
# Line 3272  for (;; ptr++) Line 4736  for (;; ptr++)
4736      firstbyte, looking for an asserted first char. */      firstbyte, looking for an asserted first char. */
4737    
4738      else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;      else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
4739        break;     /* End of processing '(' */
4740    
     /* Now update the main code pointer to the end of the group. */  
4741    
4742      code = tempcode;      /* ===================================================================*/
4743        /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
4744      /* Error if hit end of pattern */      are arranged to be the negation of the corresponding OP_values. For the
4745        back references, the values are ESC_REF plus the reference number. Only
4746      if (*ptr != ')')      back references and those types that consume a character may be repeated.
4747        {      We can test for values between ESC_b and ESC_Z for the latter; this may
4748        *errorcodeptr = ERR14;      have to change if any new ones are ever created. */
       goto FAILED;  
       }  
     break;  
   
     /* Check \ for being a real metacharacter; if not, fall through and handle  
     it as a data character at the start of a string. Escape items are checked  
     for validity in the pre-compiling pass. */  
4749    
4750      case '\\':      case '\\':
4751      tempptr = ptr;      tempptr = ptr;
4752      c = check_escape(&ptr, errorcodeptr, *brackets, options, FALSE);      c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
4753        if (*errorcodeptr != 0) goto FAILED;
     /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values  
     are arranged to be the negation of the corresponding OP_values. For the  
     back references, the values are ESC_REF plus the reference number. Only  
     back references and those types that consume a character may be repeated.  
     We can test for values between ESC_b and ESC_Z for the latter; this may  
     have to change if any new ones are ever created. */  
4754    
4755      if (c < 0)      if (c < 0)
4756        {        {
# Line 3310  for (;; ptr++) Line 4761  for (;; ptr++)
4761          continue;          continue;
4762          }          }
4763