/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory | Revision Log | View Patch

revision 91 by nigel, Sat Feb 24 21:41:34 2007 UTC | revision 205 by ph10, Fri Aug 3 13:18:33 2007 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2006 University of Cambridge             Copyright (c) 1997-2007 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 42  POSSIBILITY OF SUCH DAMAGE. Line 42  POSSIBILITY OF SUCH DAMAGE.
42  supporting internal functions that are not used by other modules. */  supporting internal functions that are not used by other modules. */
43    
44    
45  #define NLBLOCK cd            /* The block containing newline information */  #ifdef HAVE_CONFIG_H
46    #include <config.h>
47    #endif
48    
49    #define NLBLOCK cd             /* Block containing newline information */
50    #define PSSTART start_pattern  /* Field containing processed string start */
51    #define PSEND   end_pattern    /* Field containing processed string end */
52    
53  #include "pcre_internal.h"  #include "pcre_internal.h"
54    
55    
# Line 54  used by pcretest. DEBUG is not defined w Line 61  used by pcretest. DEBUG is not defined w
61  #endif  #endif
62    
63    
64    /* Macro for setting individual bits in class bitmaps. */
65    
66    #define SETBIT(a,b) a[b/8] |= (1 << (b%8))
67    
68    /* Maximum length value to check against when making sure that the integer that
69    holds the compiled pattern length does not overflow. We make it a bit less than
70    INT_MAX to allow for adding in group terminating bytes, so that we don't have
71    to check them every time. */
72    
73    #define OFLOW_MAX (INT_MAX - 20)
74    
75    
76  /*************************************************  /*************************************************
77  *      Code parameters and static tables         *  *      Code parameters and static tables         *
78  *************************************************/  *************************************************/
79    
80  /* Maximum number of items on the nested bracket stacks at compile time. This  /* This value specifies the size of stack workspace that is used during the
81  applies to the nesting of all kinds of parentheses. It does not limit  first pre-compile phase that determines how much memory is required. The regex
82  un-nested, non-capturing parentheses. This number can be made bigger if  is partly compiled into this space, but the compiled parts are discarded as
83  necessary - it is used to dimension one int and one unsigned char vector at  soon as they can be, so that hopefully there will never be an overrun. The code
84  compile time. */  does, however, check for an overrun. The largest amount I've seen used is 218,
85    so this number is very generous.
86    
87    The same workspace is used during the second, actual compile phase for
88    remembering forward references to groups so that they can be filled in at the
89    end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
90    is 4 there is plenty of room. */
91    
92  #define BRASTACK_SIZE 200  #define COMPILE_WORK_SIZE (4096)
93    
94    
95  /* Table for handling escaped characters in the range '0'-'z'. Positive returns  /* Table for handling escaped characters in the range '0'-'z'. Positive returns
# Line 73  are simple data values; negative values Line 97  are simple data values; negative values
97  on. Zero means further processing is needed (for things like \x), or the escape  on. Zero means further processing is needed (for things like \x), or the escape
98  is invalid. */  is invalid. */
99    
100  #if !EBCDIC   /* This is the "normal" table for ASCII systems */  #ifndef EBCDIC  /* This is the "normal" table for ASCII systems */
101  static const short int escapes[] = {  static const short int escapes[] = {
102       0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */       0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */
103       0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */       0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */
104     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */
105       0,      0,      0,      0,      0,      0,      0,      0,   /* H - O */  -ESC_H,      0,      0, -ESC_K,      0,      0,      0,      0,   /* H - O */
106  -ESC_P, -ESC_Q,      0, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */  -ESC_P, -ESC_Q, -ESC_R, -ESC_S,      0,      0, -ESC_V, -ESC_W,   /* P - W */
107  -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */  -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */
108     '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */     '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */
109       0,      0,      0,      0,      0,      0,  ESC_n,      0,   /* h - o */  -ESC_h,      0,      0, -ESC_k,      0,      0,  ESC_n,      0,   /* h - o */
110  -ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0,      0, -ESC_w,   /* p - w */  -ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0, -ESC_v, -ESC_w,   /* p - w */
111       0,      0, -ESC_z                                            /* x - z */       0,      0, -ESC_z                                            /* x - z */
112  };  };
113    
114  #else         /* This is the "abnormal" table for EBCDIC systems */  #else           /* This is the "abnormal" table for EBCDIC systems */
115  static const short int escapes[] = {  static const short int escapes[] = {
116  /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',  /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
117  /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,  /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
# Line 97  static const short int escapes[] = { Line 121  static const short int escapes[] = {
121  /*  70 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  70 */     0,     0,      0,       0,      0,     0,      0,      0,
122  /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',  /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
123  /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,  /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
124  /*  88 */     0,     0,      0,     '{',      0,     0,      0,      0,  /*  88 */-ESC_h,     0,      0,     '{',      0,     0,      0,      0,
125  /*  90 */     0,     0,      0,     'l',      0, ESC_n,      0, -ESC_p,  /*  90 */     0,     0, -ESC_k,     'l',      0, ESC_n,      0, -ESC_p,
126  /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,  /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
127  /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,     0, -ESC_w,      0,  /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,-ESC_v, -ESC_w,      0,
128  /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,  /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
129  /*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,
130  /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',  /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
131  /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,  /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
132  /*  C8 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,
133  /*  D0 */   '}',     0,      0,       0,      0,     0,      0, -ESC_P,  /*  D0 */   '}',     0, -ESC_K,       0,      0,     0,      0, -ESC_P,
134  /*  D8 */-ESC_Q,     0,      0,       0,      0,     0,      0,      0,  /*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
135  /*  E0 */  '\\',     0, -ESC_S,       0,      0,     0, -ESC_W, -ESC_X,  /*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,
136  /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,  /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
137  /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
138  /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0  /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0
# Line 156  static const int posix_class_maps[] = { Line 180  static const int posix_class_maps[] = {
180  };  };
181    
182    
183    #define STRING(a)  # a
184    #define XSTRING(s) STRING(s)
185    
186  /* The texts of compile-time error messages. These are "char *" because they  /* The texts of compile-time error messages. These are "char *" because they
187  are passed to the outside world. */  are passed to the outside world. Do not ever re-use any error number, because
188    they are documented. Always add a new error instead. Messages marked DEAD below
189    are no longer used. */
190    
191  static const char *error_texts[] = {  static const char *error_texts[] = {
192    "no error",    "no error",
# Line 172  static const char *error_texts[] = { Line 201  static const char *error_texts[] = {
201    "range out of order in character class",    "range out of order in character class",
202    "nothing to repeat",    "nothing to repeat",
203    /* 10 */    /* 10 */
204    "operand of unlimited repeat could match the empty string",    "operand of unlimited repeat could match the empty string",  /** DEAD **/
205    "internal error: unexpected repeat",    "internal error: unexpected repeat",
206    "unrecognized character after (?",    "unrecognized character after (?",
207    "POSIX named classes are supported only within a class",    "POSIX named classes are supported only within a class",
# Line 182  static const char *error_texts[] = { Line 211  static const char *error_texts[] = {
211    "erroffset passed as NULL",    "erroffset passed as NULL",
212    "unknown option bit(s) set",    "unknown option bit(s) set",
213    "missing ) after comment",    "missing ) after comment",
214    "parentheses nested too deeply",    "parentheses nested too deeply",  /** DEAD **/
215    /* 20 */    /* 20 */
216    "regular expression too large",    "regular expression is too large",
217    "failed to get memory",    "failed to get memory",
218    "unmatched parentheses",    "unmatched parentheses",
219    "internal error: code overflow",    "internal error: code overflow",
# Line 194  static const char *error_texts[] = { Line 223  static const char *error_texts[] = {
223    "malformed number or name after (?(",    "malformed number or name after (?(",
224    "conditional group contains more than two branches",    "conditional group contains more than two branches",
225    "assertion expected after (?(",    "assertion expected after (?(",
226    "(?R or (?digits must be followed by )",    "(?R or (?[+-]digits must be followed by )",
227    /* 30 */    /* 30 */
228    "unknown POSIX class name",    "unknown POSIX class name",
229    "POSIX collating elements are not supported",    "POSIX collating elements are not supported",
230    "this version of PCRE is not compiled with PCRE_UTF8 support",    "this version of PCRE is not compiled with PCRE_UTF8 support",
231    "spare error",    "spare error",  /** DEAD **/
232    "character value in \\x{...} sequence is too large",    "character value in \\x{...} sequence is too large",
233    /* 35 */    /* 35 */
234    "invalid condition (?(0)",    "invalid condition (?(0)",
# Line 210  static const char *error_texts[] = { Line 239  static const char *error_texts[] = {
239    /* 40 */    /* 40 */
240    "recursive call could loop indefinitely",    "recursive call could loop indefinitely",
241    "unrecognized character after (?P",    "unrecognized character after (?P",
242    "syntax error after (?P",    "syntax error in subpattern name (missing terminator)",
243    "two named subpatterns have the same name",    "two named subpatterns have the same name",
244    "invalid UTF-8 string",    "invalid UTF-8 string",
245    /* 45 */    /* 45 */
246    "support for \\P, \\p, and \\X has not been compiled",    "support for \\P, \\p, and \\X has not been compiled",
247    "malformed \\P or \\p sequence",    "malformed \\P or \\p sequence",
248    "unknown property name after \\P or \\p",    "unknown property name after \\P or \\p",
249    "subpattern name is too long (maximum 32 characters)",    "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)",
250    "too many named subpatterns (maximum 10,000)",    "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")",
251    /* 50 */    /* 50 */
252    "repeated subpattern is too long",    "repeated subpattern is too long",    /** DEAD **/
253    "octal value is greater than \\377 (not in UTF-8 mode)"    "octal value is greater than \\377 (not in UTF-8 mode)",
254      "internal error: overran compiling workspace",
255      "internal error: previously-checked referenced subpattern not found",
256      "DEFINE group contains more than one branch",
257      /* 55 */
258      "repeating a DEFINE group is not allowed",
259      "inconsistent NEWLINE options",
260      "\\g is not followed by a braced name or an optionally braced non-zero number",
261      "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number"
262  };  };
263    
264    
# Line 241  For convenience, we use the same bit def Line 278  For convenience, we use the same bit def
278    
279  Then we can use ctype_digit and ctype_xdigit in the code. */  Then we can use ctype_digit and ctype_xdigit in the code. */
280    
281  #if !EBCDIC    /* This is the "normal" case, for ASCII systems */  #ifndef EBCDIC  /* This is the "normal" case, for ASCII systems */
282  static const unsigned char digitab[] =  static const unsigned char digitab[] =
283    {    {
284    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
# Line 277  static const unsigned char digitab[] = Line 314  static const unsigned char digitab[] =
314    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
315    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
316    
317  #else          /* This is the "abnormal" case, for EBCDIC systems */  #else           /* This is the "abnormal" case, for EBCDIC systems */
318  static const unsigned char digitab[] =  static const unsigned char digitab[] =
319    {    {
320    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
# Line 291  static const unsigned char digitab[] = Line 328  static const unsigned char digitab[] =
328    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
329    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
330    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
331    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88-     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- 95    */
332    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
333    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
334    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
# Line 325  static const unsigned char ebcdic_charta Line 362  static const unsigned char ebcdic_charta
362    0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */    0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */
363    0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */    0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
364    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
365    0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88-  */    0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- 95 */
366    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
367    0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */    0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
368    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
# Line 352  static const unsigned char ebcdic_charta Line 389  static const unsigned char ebcdic_charta
389  /* Definition to allow mutual recursion */  /* Definition to allow mutual recursion */
390    
391  static BOOL  static BOOL
392    compile_regex(int, int, int *, uschar **, const uschar **, int *, BOOL, int,    compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
393      int *, int *, branch_chain *, compile_data *);      int *, int *, branch_chain *, compile_data *, int *);
394    
395    
396    
# Line 363  static BOOL Line 400  static BOOL
400    
401  /* This function is called when a \ has been encountered. It either returns a  /* This function is called when a \ has been encountered. It either returns a
402  positive value for a simple escape such as \n, or a negative value which  positive value for a simple escape such as \n, or a negative value which
403  encodes one of the more complicated things such as \d. When UTF-8 is enabled,  encodes one of the more complicated things such as \d. A backreference to group
404  a positive value greater than 255 may be returned. On entry, ptr is pointing at  n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
405  the \. On exit, it is on the final character of the escape sequence.  UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
406    ptr is pointing at the \. On exit, it is on the final character of the escape
407    sequence.
408    
409  Arguments:  Arguments:
410    ptrptr         points to the pattern position pointer    ptrptr         points to the pattern position pointer
# Line 398  if (c == 0) *errorcodeptr = ERR1; Line 437  if (c == 0) *errorcodeptr = ERR1;
437  a table. A non-zero result is something that can be returned immediately.  a table. A non-zero result is something that can be returned immediately.
438  Otherwise further processing may be required. */  Otherwise further processing may be required. */
439    
440  #if !EBCDIC    /* ASCII coding */  #ifndef EBCDIC  /* ASCII coding */
441  else if (c < '0' || c > 'z') {}                           /* Not alphameric */  else if (c < '0' || c > 'z') {}                           /* Not alphameric */
442  else if ((i = escapes[c - '0']) != 0) c = i;  else if ((i = escapes[c - '0']) != 0) c = i;
443    
444  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
445  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphameric */  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphameric */
446  else if ((i = escapes[c - 0x48]) != 0)  c = i;  else if ((i = escapes[c - 0x48]) != 0)  c = i;
447  #endif  #endif
# Line 412  else if ((i = escapes[c - 0x48]) != 0) Line 451  else if ((i = escapes[c - 0x48]) != 0)
451  else  else
452    {    {
453    const uschar *oldptr;    const uschar *oldptr;
454      BOOL braced, negated;
455    
456    switch (c)    switch (c)
457      {      {
458      /* A number of Perl escapes are not handled by PCRE. We give an explicit      /* A number of Perl escapes are not handled by PCRE. We give an explicit
# Line 425  else Line 466  else
466      *errorcodeptr = ERR37;      *errorcodeptr = ERR37;
467      break;      break;
468    
469        /* \g must be followed by a number, either plain or braced. If positive, it
470        is an absolute backreference. If negative, it is a relative backreference.
471        This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a
472        reference to a named group. This is part of Perl's movement towards a
473        unified syntax for back references. As this is synonymous with \k{name}, we
474        fudge it up by pretending it really was \k. */
475    
476        case 'g':
477        if (ptr[1] == '{')
478          {
479          const uschar *p;
480          for (p = ptr+2; *p != 0 && *p != '}'; p++)
481            if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
482          if (*p != 0 && *p != '}')
483            {
484            c = -ESC_k;
485            break;
486            }
487          braced = TRUE;
488          ptr++;
489          }
490        else braced = FALSE;
491    
492        if (ptr[1] == '-')
493          {
494          negated = TRUE;
495          ptr++;
496          }
497        else negated = FALSE;
498    
499        c = 0;
500        while ((digitab[ptr[1]] & ctype_digit) != 0)
501          c = c * 10 + *(++ptr) - '0';
502    
503        if (c == 0 || (braced && *(++ptr) != '}'))
504          {
505          *errorcodeptr = ERR57;
506          return 0;
507          }
508    
509        if (negated)
510          {
511          if (c > bracount)
512            {
513            *errorcodeptr = ERR15;
514            return 0;
515            }
516          c = bracount - (c - 1);
517          }
518    
519        c = -(ESC_REF + c);
520        break;
521    
522      /* The handling of escape sequences consisting of a string of digits      /* The handling of escape sequences consisting of a string of digits
523      starting with one that is not zero is not straightforward. By experiment,      starting with one that is not zero is not straightforward. By experiment,
524      the way Perl works seems to be as follows:      the way Perl works seems to be as follows:
# Line 495  else Line 589  else
589          if (c == 0 && cc == '0') continue;     /* Leading zeroes */          if (c == 0 && cc == '0') continue;     /* Leading zeroes */
590          count++;          count++;
591    
592  #if !EBCDIC    /* ASCII coding */  #ifndef EBCDIC  /* ASCII coding */
593          if (cc >= 'a') cc -= 32;               /* Convert to upper case */          if (cc >= 'a') cc -= 32;               /* Convert to upper case */
594          c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));          c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
595  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
596          if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */          if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */
597          c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));          c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
598  #endif  #endif
# Line 522  else Line 616  else
616        {        {
617        int cc;                               /* Some compilers don't like ++ */        int cc;                               /* Some compilers don't like ++ */
618        cc = *(++ptr);                        /* in initializers */        cc = *(++ptr);                        /* in initializers */
619  #if !EBCDIC    /* ASCII coding */  #ifndef EBCDIC  /* ASCII coding */
620        if (cc >= 'a') cc -= 32;              /* Convert to upper case */        if (cc >= 'a') cc -= 32;              /* Convert to upper case */
621        c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));        c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
622  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
623        if (cc <= 'z') cc += 64;              /* Convert to upper case */        if (cc <= 'z') cc += 64;              /* Convert to upper case */
624        c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));        c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
625  #endif  #endif
626        }        }
627      break;      break;
628    
629      /* Other special escapes not starting with a digit are straightforward */      /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
630        This coding is ASCII-specific, but then the whole concept of \cx is
631        ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
632    
633      case 'c':      case 'c':
634      c = *(++ptr);      c = *(++ptr);
# Line 542  else Line 638  else
638        return 0;        return 0;
639        }        }
640    
641      /* A letter is upper-cased; then the 0x40 bit is flipped. This coding  #ifndef EBCDIC  /* ASCII coding */
     is ASCII-specific, but then the whole concept of \cx is ASCII-specific.  
     (However, an EBCDIC equivalent has now been added.) */  
   
 #if !EBCDIC    /* ASCII coding */  
642      if (c >= 'a' && c <= 'z') c -= 32;      if (c >= 'a' && c <= 'z') c -= 32;
643      c ^= 0x40;      c ^= 0x40;
644  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
645      if (c >= 'a' && c <= 'z') c += 64;      if (c >= 'a' && c <= 'z') c += 64;
646      c ^= 0xC0;      c ^= 0xC0;
647  #endif  #endif
# Line 619  if (c == '{') Line 711  if (c == '{')
711      *negptr = TRUE;      *negptr = TRUE;
712      ptr++;      ptr++;
713      }      }
714    for (i = 0; i < sizeof(name) - 1; i++)    for (i = 0; i < (int)sizeof(name) - 1; i++)
715      {      {
716      c = *(++ptr);      c = *(++ptr);
717      if (c == 0) goto ERROR_RETURN;      if (c == 0) goto ERROR_RETURN;
# Line 772  return p; Line 864  return p;
864    
865    
866  /*************************************************  /*************************************************
867  *     Find forward referenced named subpattern   *  *       Find forward referenced subpattern       *
868  *************************************************/  *************************************************/
869    
870  /* This function scans along a pattern looking for capturing subpatterns, and  /* This function scans along a pattern's text looking for capturing
871  counting them. If it finds a named pattern that matches the name it is given,  subpatterns, and counting them. If it finds a named pattern that matches the
872  it returns its number. This is used for forward references to named  name it is given, it returns its number. Alternatively, if the name is NULL, it
873  subpatterns. We know that if (?P< is encountered, the name will be terminated  returns when it reaches a given numbered subpattern. This is used for forward
874  by '>' because that is checked in the first pass.  references to subpatterns. We know that if (?P< is encountered, the name will
875    be terminated by '>' because that is checked in the first pass.
876    
877  Arguments:  Arguments:
878    pointer      current position in the pattern    ptr          current position in the pattern
879    count        current count of capturing parens    count        current count of capturing parens so far encountered
880    name         name to seek    name         name to seek, or NULL if seeking a numbered subpattern
881    namelen      name length    lorn         name length, or subpattern number if name is NULL
882      xmode        TRUE if we are in /x mode
883    
884  Returns:       the number of the named subpattern, or -1 if not found  Returns:       the number of the named subpattern, or -1 if not found
885  */  */
886    
887  static int  static int
888  find_named_parens(const uschar *ptr, int count, const uschar *name, int namelen)  find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
889      BOOL xmode)
890  {  {
891  const uschar *thisname;  const uschar *thisname;
892    
893  for (; *ptr != 0; ptr++)  for (; *ptr != 0; ptr++)
894    {    {
895    if (*ptr == '\\' && ptr[1] != 0) { ptr++; continue; }    int term;
896    
897      /* Skip over backslashed characters and also entire \Q...\E */
898    
899      if (*ptr == '\\')
900        {
901        if (*(++ptr) == 0) return -1;
902        if (*ptr == 'Q') for (;;)
903          {
904          while (*(++ptr) != 0 && *ptr != '\\');
905          if (*ptr == 0) return -1;
906          if (*(++ptr) == 'E') break;
907          }
908        continue;
909        }
910    
911      /* Skip over character classes */
912    
913      if (*ptr == '[')
914        {
915        while (*(++ptr) != ']')
916          {
917          if (*ptr == '\\')
918            {
919            if (*(++ptr) == 0) return -1;
920            if (*ptr == 'Q') for (;;)
921              {
922              while (*(++ptr) != 0 && *ptr != '\\');
923              if (*ptr == 0) return -1;
924              if (*(++ptr) == 'E') break;
925              }
926            continue;
927            }
928          }
929        continue;
930        }
931    
932      /* Skip comments in /x mode */
933    
934      if (xmode && *ptr == '#')
935        {
936        while (*(++ptr) != 0 && *ptr != '\n');
937        if (*ptr == 0) return -1;
938        continue;
939        }
940    
941      /* An opening parens must now be a real metacharacter */
942    
943    if (*ptr != '(') continue;    if (*ptr != '(') continue;
944    if (ptr[1] != '?') { count++; continue; }    if (ptr[1] != '?')
945    if (ptr[2] == '(') { ptr += 2; continue; }      {
946    if (ptr[2] != 'P' || ptr[3] != '<') continue;      count++;
947        if (name == NULL && count == lorn) return count;
948        continue;
949        }
950    
951      ptr += 2;
952      if (*ptr == 'P') ptr++;                      /* Allow optional P */
953    
954      /* We have to disambiguate (?<! and (?<= from (?<name> */
955    
956      if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
957           *ptr != '\'')
958        continue;
959    
960    count++;    count++;
961    ptr += 4;  
962      if (name == NULL && count == lorn) return count;
963      term = *ptr++;
964      if (term == '<') term = '>';
965    thisname = ptr;    thisname = ptr;
966    while (*ptr != '>') ptr++;    while (*ptr != term) ptr++;
967    if (namelen == ptr - thisname && strncmp(name, thisname, namelen) == 0)    if (name != NULL && lorn == ptr - thisname &&
968          strncmp((const char *)name, (const char *)thisname, lorn) == 0)
969      return count;      return count;
970    }    }
971    
972  return -1;  return -1;
973  }  }
974    
# Line 862  for (;;) Line 1023  for (;;)
1023    
1024      case OP_CALLOUT:      case OP_CALLOUT:
1025      case OP_CREF:      case OP_CREF:
1026      case OP_BRANUMBER:      case OP_RREF:
1027        case OP_DEF:
1028      code += _pcre_OP_lengths[*code];      code += _pcre_OP_lengths[*code];
1029      break;      break;
1030    
# Line 907  for (;;) Line 1069  for (;;)
1069    {    {
1070    int d;    int d;
1071    register int op = *cc;    register int op = *cc;
   if (op >= OP_BRA) op = OP_BRA;  
1072    
1073    switch (op)    switch (op)
1074      {      {
1075        case OP_CBRA:
1076      case OP_BRA:      case OP_BRA:
1077      case OP_ONCE:      case OP_ONCE:
1078      case OP_COND:      case OP_COND:
1079      d = find_fixedlength(cc, options);      d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1080      if (d < 0) return d;      if (d < 0) return d;
1081      branchlength += d;      branchlength += d;
1082      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
# Line 949  for (;;) Line 1111  for (;;)
1111      /* Skip over things that don't match chars */      /* Skip over things that don't match chars */
1112    
1113      case OP_REVERSE:      case OP_REVERSE:
     case OP_BRANUMBER:  
1114      case OP_CREF:      case OP_CREF:
1115        case OP_RREF:
1116        case OP_DEF:
1117      case OP_OPT:      case OP_OPT:
1118      case OP_CALLOUT:      case OP_CALLOUT:
1119      case OP_SOD:      case OP_SOD:
# Line 1094  for (;;) Line 1257  for (;;)
1257    
1258    if (c == OP_XCLASS) code += GET(code, 1);    if (c == OP_XCLASS) code += GET(code, 1);
1259    
1260    /* Handle bracketed group */    /* Handle capturing bracket */
1261    
1262    else if (c > OP_BRA)    else if (c == OP_CBRA)
1263      {      {
1264      int n = c - OP_BRA;      int n = GET2(code, 1+LINK_SIZE);
     if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE);  
1265      if (n == number) return (uschar *)code;      if (n == number) return (uschar *)code;
1266      code += _pcre_OP_lengths[OP_BRA];      code += _pcre_OP_lengths[c];
1267      }      }
1268    
1269    /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes    /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1270    that are followed by a character may be followed by a multi-byte character.    a multi-byte character. The length in the table is a minimum, so we have to
1271    The length in the table is a minimum, so we have to scan along to skip the    arrange to skip the extra bytes. */
   extra bytes. All opcodes are less than 128, so we can use relatively  
   efficient code. */  
1272    
1273    else    else
1274      {      {
1275      code += _pcre_OP_lengths[c];      code += _pcre_OP_lengths[c];
1276    #ifdef SUPPORT_UTF8
1277      if (utf8) switch(c)      if (utf8) switch(c)
1278        {        {
1279        case OP_CHAR:        case OP_CHAR:
# Line 1120  for (;;) Line 1281  for (;;)
1281        case OP_EXACT:        case OP_EXACT:
1282        case OP_UPTO:        case OP_UPTO:
1283        case OP_MINUPTO:        case OP_MINUPTO:
1284          case OP_POSUPTO:
1285        case OP_STAR:        case OP_STAR:
1286        case OP_MINSTAR:        case OP_MINSTAR:
1287          case OP_POSSTAR:
1288        case OP_PLUS:        case OP_PLUS:
1289        case OP_MINPLUS:        case OP_MINPLUS:
1290          case OP_POSPLUS:
1291        case OP_QUERY:        case OP_QUERY:
1292        case OP_MINQUERY:        case OP_MINQUERY:
1293        while ((*code & 0xc0) == 0x80) code++;        case OP_POSQUERY:
1294          if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1295        break;        break;
1296        }        }
1297    #endif
1298      }      }
1299    }    }
1300  }  }
# Line 1164  for (;;) Line 1330  for (;;)
1330    
1331    if (c == OP_XCLASS) code += GET(code, 1);    if (c == OP_XCLASS) code += GET(code, 1);
1332    
   /* All bracketed groups have the same length. */  
   
   else if (c > OP_BRA)  
     {  
     code += _pcre_OP_lengths[OP_BRA];  
     }  
   
1333    /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes    /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes
1334    that are followed by a character may be followed by a multi-byte character.    that are followed by a character may be followed by a multi-byte character.
1335    The length in the table is a minimum, so we have to scan along to skip the    The length in the table is a minimum, so we have to arrange to skip the extra
1336    extra bytes. All opcodes are less than 128, so we can use relatively    bytes. */
   efficient code. */  
1337    
1338    else    else
1339      {      {
1340      code += _pcre_OP_lengths[c];      code += _pcre_OP_lengths[c];
1341    #ifdef SUPPORT_UTF8
1342      if (utf8) switch(c)      if (utf8) switch(c)
1343        {        {
1344        case OP_CHAR:        case OP_CHAR:
# Line 1187  for (;;) Line 1346  for (;;)
1346        case OP_EXACT:        case OP_EXACT:
1347        case OP_UPTO:        case OP_UPTO:
1348        case OP_MINUPTO:        case OP_MINUPTO:
1349          case OP_POSUPTO:
1350        case OP_STAR:        case OP_STAR:
1351        case OP_MINSTAR:        case OP_MINSTAR:
1352          case OP_POSSTAR:
1353        case OP_PLUS:        case OP_PLUS:
1354        case OP_MINPLUS:        case OP_MINPLUS:
1355          case OP_POSPLUS:
1356        case OP_QUERY:        case OP_QUERY:
1357        case OP_MINQUERY:        case OP_MINQUERY:
1358        while ((*code & 0xc0) == 0x80) code++;        case OP_POSQUERY:
1359          if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1360        break;        break;
1361        }        }
1362    #endif
1363      }      }
1364    }    }
1365  }  }
# Line 1207  for (;;) Line 1371  for (;;)
1371  *************************************************/  *************************************************/
1372    
1373  /* This function scans through a branch of a compiled pattern to see whether it  /* This function scans through a branch of a compiled pattern to see whether it
1374  can match the empty string or not. It is called only from could_be_empty()  can match the empty string or not. It is called from could_be_empty()
1375  below. Note that first_significant_code() skips over assertions. If we hit an  below and from compile_branch() when checking for an unlimited repeat of a
1376  unclosed bracket, we return "empty" - this means we've struck an inner bracket  group that can match nothing. Note that first_significant_code() skips over
1377  whose current branch will already have been scanned.  assertions. If we hit an unclosed bracket, we return "empty" - this means we've
1378    struck an inner bracket whose current branch will already have been scanned.
1379    
1380  Arguments:  Arguments:
1381    code        points to start of search    code        points to start of search
# Line 1224  static BOOL Line 1389  static BOOL
1389  could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)  could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1390  {  {
1391  register int c;  register int c;
1392  for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0, TRUE);  for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1393       code < endcode;       code < endcode;
1394       code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))       code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1395    {    {
# Line 1232  for (code = first_significant_code(code Line 1397  for (code = first_significant_code(code
1397    
1398    c = *code;    c = *code;
1399    
1400    if (c >= OP_BRA)    /* Groups with zero repeats can of course be empty; skip them. */
1401    
1402      if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1403        {
1404        code += _pcre_OP_lengths[c];
1405        do code += GET(code, 1); while (*code == OP_ALT);
1406        c = *code;
1407        continue;
1408        }
1409    
1410      /* For other groups, scan the branches. */
1411    
1412      if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE)
1413      {      {
1414      BOOL empty_branch;      BOOL empty_branch;
1415      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
# Line 1248  for (code = first_significant_code(code Line 1425  for (code = first_significant_code(code
1425        }        }
1426      while (*code == OP_ALT);      while (*code == OP_ALT);
1427      if (!empty_branch) return FALSE;   /* All branches are non-empty */      if (!empty_branch) return FALSE;   /* All branches are non-empty */
     code += 1 + LINK_SIZE;  
1428      c = *code;      c = *code;
1429        continue;
1430      }      }
1431    
1432    else switch (c)    /* Handle the other opcodes */
1433    
1434      switch (c)
1435      {      {
1436      /* Check for quantifiers after a class */      /* Check for quantifiers after a class */
1437    
# Line 1308  for (code = first_significant_code(code Line 1487  for (code = first_significant_code(code
1487      case OP_NOT:      case OP_NOT:
1488      case OP_PLUS:      case OP_PLUS:
1489      case OP_MINPLUS:      case OP_MINPLUS:
1490        case OP_POSPLUS:
1491      case OP_EXACT:      case OP_EXACT:
1492      case OP_NOTPLUS:      case OP_NOTPLUS:
1493      case OP_NOTMINPLUS:      case OP_NOTMINPLUS:
1494        case OP_NOTPOSPLUS:
1495      case OP_NOTEXACT:      case OP_NOTEXACT:
1496      case OP_TYPEPLUS:      case OP_TYPEPLUS:
1497      case OP_TYPEMINPLUS:      case OP_TYPEMINPLUS:
1498        case OP_TYPEPOSPLUS:
1499      case OP_TYPEEXACT:      case OP_TYPEEXACT:
1500      return FALSE;      return FALSE;
1501    
# Line 1325  for (code = first_significant_code(code Line 1507  for (code = first_significant_code(code
1507      case OP_ALT:      case OP_ALT:
1508      return TRUE;      return TRUE;
1509    
1510      /* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO  may be      /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1511      followed by a multibyte character */      MINUPTO, and POSUPTO may be followed by a multibyte character */
1512    
1513  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1514      case OP_STAR:      case OP_STAR:
1515      case OP_MINSTAR:      case OP_MINSTAR:
1516        case OP_POSSTAR:
1517      case OP_QUERY:      case OP_QUERY:
1518      case OP_MINQUERY:      case OP_MINQUERY:
1519        case OP_POSQUERY:
1520      case OP_UPTO:      case OP_UPTO:
1521      case OP_MINUPTO:      case OP_MINUPTO:
1522        case OP_POSUPTO:
1523      if (utf8) while ((code[2] & 0xc0) == 0x80) code++;      if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1524      break;      break;
1525  #endif  #endif
# Line 1452  earlier groups that are outside the curr Line 1637  earlier groups that are outside the curr
1637  optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before  optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1638  it, after it has been compiled. This means that any OP_RECURSE items within it  it, after it has been compiled. This means that any OP_RECURSE items within it
1639  that refer to the group itself or any contained groups have to have their  that refer to the group itself or any contained groups have to have their
1640  offsets adjusted. That is the job of this function. Before it is called, the  offsets adjusted. That is one of the jobs of this function. Before it is called,
1641  partially compiled regex must be temporarily terminated with OP_END.  the partially compiled regex must be temporarily terminated with OP_END.
1642    
1643    This function has been extended with the possibility of forward references for
1644    recursions and subroutine calls. It must also check the list of such references
1645    for the group we are dealing with. If it finds that one of the recursions in
1646    the current group is on this list, it adjusts the offset in the list, not the
1647    value in the reference (which is a group number).
1648    
1649  Arguments:  Arguments:
1650    group      points to the start of the group    group      points to the start of the group
1651    adjust     the amount by which the group is to be moved    adjust     the amount by which the group is to be moved
1652    utf8       TRUE in UTF-8 mode    utf8       TRUE in UTF-8 mode
1653    cd         contains pointers to tables etc.    cd         contains pointers to tables etc.
1654      save_hwm   the hwm forward reference pointer at the start of the group
1655    
1656  Returns:     nothing  Returns:     nothing
1657  */  */
1658    
1659  static void  static void
1660  adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd)  adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1661      uschar *save_hwm)
1662  {  {
1663  uschar *ptr = group;  uschar *ptr = group;
1664  while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)  while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1665    {    {
1666    int offset = GET(ptr, 1);    int offset;
1667    if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);    uschar *hc;
1668    
1669      /* See if this recursion is on the forward reference list. If so, adjust the
1670      reference. */
1671    
1672      for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1673        {
1674        offset = GET(hc, 0);
1675        if (cd->start_code + offset == ptr + 1)
1676          {
1677          PUT(hc, 0, offset + adjust);
1678          break;
1679          }
1680        }
1681    
1682      /* Otherwise, adjust the recursion offset if it's after the start of this
1683      group. */
1684    
1685      if (hc >= cd->hwm)
1686        {
1687        offset = GET(ptr, 1);
1688        if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1689        }
1690    
1691    ptr += 1 + LINK_SIZE;    ptr += 1 + LINK_SIZE;
1692    }    }
1693  }  }
# Line 1550  Yield: TRUE when range returned; Line 1766  Yield: TRUE when range returned;
1766  */  */
1767    
1768  static BOOL  static BOOL
1769  get_othercase_range(int *cptr, int d, int *ocptr, int *odptr)  get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
1770      unsigned int *odptr)
1771  {  {
1772  int c, othercase, next;  unsigned int c, othercase, next;
1773    
1774  for (c = *cptr; c <= d; c++)  for (c = *cptr; c <= d; c++)
1775    { if ((othercase = _pcre_ucp_othercase(c)) >= 0) break; }    { if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) break; }
1776    
1777  if (c > d) return FALSE;  if (c > d) return FALSE;
1778    
# Line 1576  return TRUE; Line 1793  return TRUE;
1793  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
1794    
1795    
1796    
1797    /*************************************************
1798    *     Check if auto-possessifying is possible    *
1799    *************************************************/
1800    
1801    /* This function is called for unlimited repeats of certain items, to see
1802    whether the next thing could possibly match the repeated item. If not, it makes
1803    sense to automatically possessify the repeated item.
1804    
1805    Arguments:
1806      op_code       the repeated op code
1807      item          data for this item, depends on the opcode
1808      utf8          TRUE in UTF-8 mode
1809      utf8_char     used for utf8 character bytes, NULL if not relevant
1810      ptr           next character in pattern
1811      options       options bits
1812      cd            contains pointers to tables etc.
1813    
1814    Returns:        TRUE if possessifying is wanted
1815    */
1816    
1817    static BOOL
1818    check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
1819      const uschar *ptr, int options, compile_data *cd)
1820    {
1821    int next;
1822    
1823    /* Skip whitespace and comments in extended mode */
1824    
1825    if ((options & PCRE_EXTENDED) != 0)
1826      {
1827      for (;;)
1828        {
1829        while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1830        if (*ptr == '#')
1831          {
1832          while (*(++ptr) != 0)
1833            if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1834          }
1835        else break;
1836        }
1837      }
1838    
1839    /* If the next item is one that we can handle, get its value. A non-negative
1840    value is a character, a negative value is an escape value. */
1841    
1842    if (*ptr == '\\')
1843      {
1844      int temperrorcode = 0;
1845      next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
1846      if (temperrorcode != 0) return FALSE;
1847      ptr++;    /* Point after the escape sequence */
1848      }
1849    
1850    else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
1851      {
1852    #ifdef SUPPORT_UTF8
1853      if (utf8) { GETCHARINC(next, ptr); } else
1854    #endif
1855      next = *ptr++;
1856      }
1857    
1858    else return FALSE;
1859    
1860    /* Skip whitespace and comments in extended mode */
1861    
1862    if ((options & PCRE_EXTENDED) != 0)
1863      {
1864      for (;;)
1865        {
1866        while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1867        if (*ptr == '#')
1868          {
1869          while (*(++ptr) != 0)
1870            if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1871          }
1872        else break;
1873        }
1874      }
1875    
1876    /* If the next thing is itself optional, we have to give up. */
1877    
1878    if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
1879      return FALSE;
1880    
1881    /* Now compare the next item with the previous opcode. If the previous is a
1882    positive single character match, "item" either contains the character or, if
1883    "item" is greater than 127 in utf8 mode, the character's bytes are in
1884    utf8_char. */
1885    
1886    
1887    /* Handle cases when the next item is a character. */
1888    
1889    if (next >= 0) switch(op_code)
1890      {
1891      case OP_CHAR:
1892    #ifdef SUPPORT_UTF8
1893      if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1894    #endif
1895      return item != next;
1896    
1897      /* For CHARNC (caseless character) we must check the other case. If we have
1898      Unicode property support, we can use it to test the other case of
1899      high-valued characters. */
1900    
1901      case OP_CHARNC:
1902    #ifdef SUPPORT_UTF8
1903      if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1904    #endif
1905      if (item == next) return FALSE;
1906    #ifdef SUPPORT_UTF8
1907      if (utf8)
1908        {
1909        unsigned int othercase;
1910        if (next < 128) othercase = cd->fcc[next]; else
1911    #ifdef SUPPORT_UCP
1912        othercase = _pcre_ucp_othercase((unsigned int)next);
1913    #else
1914        othercase = NOTACHAR;
1915    #endif
1916        return (unsigned int)item != othercase;
1917        }
1918      else
1919    #endif  /* SUPPORT_UTF8 */
1920      return (item != cd->fcc[next]);  /* Non-UTF-8 mode */
1921    
1922      /* For OP_NOT, "item" must be a single-byte character. */
1923    
1924      case OP_NOT:
1925      if (next < 0) return FALSE;  /* Not a character */
1926      if (item == next) return TRUE;
1927      if ((options & PCRE_CASELESS) == 0) return FALSE;
1928    #ifdef SUPPORT_UTF8
1929      if (utf8)
1930        {
1931        unsigned int othercase;
1932        if (next < 128) othercase = cd->fcc[next]; else
1933    #ifdef SUPPORT_UCP
1934        othercase = _pcre_ucp_othercase(next);
1935    #else
1936        othercase = NOTACHAR;
1937    #endif
1938        return (unsigned int)item == othercase;
1939        }
1940      else
1941    #endif  /* SUPPORT_UTF8 */
1942      return (item == cd->fcc[next]);  /* Non-UTF-8 mode */
1943    
1944      case OP_DIGIT:
1945      return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
1946    
1947      case OP_NOT_DIGIT:
1948      return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
1949    
1950      case OP_WHITESPACE:
1951      return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
1952    
1953      case OP_NOT_WHITESPACE:
1954      return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
1955    
1956      case OP_WORDCHAR:
1957      return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
1958    
1959      case OP_NOT_WORDCHAR:
1960      return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
1961    
1962      case OP_HSPACE:
1963      case OP_NOT_HSPACE:
1964      switch(next)
1965        {
1966        case 0x09:
1967        case 0x20:
1968        case 0xa0:
1969        case 0x1680:
1970        case 0x180e:
1971        case 0x2000:
1972        case 0x2001:
1973        case 0x2002:
1974        case 0x2003:
1975        case 0x2004:
1976        case 0x2005:
1977        case 0x2006:
1978        case 0x2007:
1979        case 0x2008:
1980        case 0x2009:
1981        case 0x200A:
1982        case 0x202f:
1983        case 0x205f:
1984        case 0x3000:
1985        return op_code != OP_HSPACE;
1986        default:
1987        return op_code == OP_HSPACE;
1988        }
1989    
1990      case OP_VSPACE:
1991      case OP_NOT_VSPACE:
1992      switch(next)
1993        {
1994        case 0x0a:
1995        case 0x0b:
1996        case 0x0c:
1997        case 0x0d:
1998        case 0x85:
1999        case 0x2028:
2000        case 0x2029:
2001        return op_code != OP_VSPACE;
2002        default:
2003        return op_code == OP_VSPACE;
2004        }
2005    
2006      default:
2007      return FALSE;
2008      }
2009    
2010    
2011    /* Handle the case when the next item is \d, \s, etc. */
2012    
2013    switch(op_code)
2014      {
2015      case OP_CHAR:
2016      case OP_CHARNC:
2017    #ifdef SUPPORT_UTF8
2018      if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2019    #endif
2020      switch(-next)
2021        {
2022        case ESC_d:
2023        return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2024    
2025        case ESC_D:
2026        return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2027    
2028        case ESC_s:
2029        return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2030    
2031        case ESC_S:
2032        return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2033    
2034        case ESC_w:
2035        return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2036    
2037        case ESC_W:
2038        return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2039    
2040        case ESC_h:
2041        case ESC_H:
2042        switch(item)
2043          {
2044          case 0x09:
2045          case 0x20:
2046          case 0xa0:
2047          case 0x1680:
2048          case 0x180e:
2049          case 0x2000:
2050          case 0x2001:
2051          case 0x2002:
2052          case 0x2003:
2053          case 0x2004:
2054          case 0x2005:
2055          case 0x2006:
2056          case 0x2007:
2057          case 0x2008:
2058          case 0x2009:
2059          case 0x200A:
2060          case 0x202f:
2061          case 0x205f:
2062          case 0x3000:
2063          return -next != ESC_h;
2064          default:
2065          return -next == ESC_h;
2066          }
2067    
2068        case ESC_v:
2069        case ESC_V:
2070        switch(item)
2071          {
2072          case 0x0a:
2073          case 0x0b:
2074          case 0x0c:
2075          case 0x0d:
2076          case 0x85:
2077          case 0x2028:
2078          case 0x2029:
2079          return -next != ESC_v;
2080          default:
2081          return -next == ESC_v;
2082          }
2083    
2084        default:
2085        return FALSE;
2086        }
2087    
2088      case OP_DIGIT:
2089      return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2090             next == -ESC_h || next == -ESC_v;
2091    
2092      case OP_NOT_DIGIT:
2093      return next == -ESC_d;
2094    
2095      case OP_WHITESPACE:
2096      return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2097    
2098      case OP_NOT_WHITESPACE:
2099      return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2100    
2101      case OP_HSPACE:
2102      return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2103    
2104      case OP_NOT_HSPACE:
2105      return next == -ESC_h;
2106    
2107      /* Can't have \S in here because VT matches \S (Perl anomaly) */
2108      case OP_VSPACE:
2109      return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2110    
2111      case OP_NOT_VSPACE:
2112      return next == -ESC_v;
2113    
2114      case OP_WORDCHAR:
2115      return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2116    
2117      case OP_NOT_WORDCHAR:
2118      return next == -ESC_w || next == -ESC_d;
2119    
2120      default:
2121      return FALSE;
2122      }
2123    
2124    /* Control does not reach here */
2125    }
2126    
2127    
2128    
2129  /*************************************************  /*************************************************
2130  *           Compile one branch                   *  *           Compile one branch                   *
2131  *************************************************/  *************************************************/
2132    
2133  /* Scan the pattern, compiling it into the code vector. If the options are  /* Scan the pattern, compiling it into the code vector. If the options are
2134  changed during the branch, the pointer is used to change the external options  changed during the branch, the pointer is used to change the external options
2135  bits.  bits. This function is used during the pre-compile phase when we are trying
2136    to find out the amount of memory needed, as well as during the real compile
2137    phase. The value of lengthptr distinguishes the two phases.
2138    
2139  Arguments:  Arguments:
2140    optionsptr     pointer to the option bits    optionsptr     pointer to the option bits
   brackets       points to number of extracting brackets used  
2141    codeptr        points to the pointer to the current code point    codeptr        points to the pointer to the current code point
2142    ptrptr         points to the current pattern pointer    ptrptr         points to the current pattern pointer
2143    errorcodeptr   points to error code variable    errorcodeptr   points to error code variable
# Line 1594  Arguments: Line 2145  Arguments:
2145    reqbyteptr     set to the last literal character required, else < 0    reqbyteptr     set to the last literal character required, else < 0
2146    bcptr          points to current branch chain    bcptr          points to current branch chain
2147    cd             contains pointers to tables etc.    cd             contains pointers to tables etc.
2148      lengthptr      NULL during the real compile phase
2149                     points to length accumulator during pre-compile phase
2150    
2151  Returns:         TRUE on success  Returns:         TRUE on success
2152                   FALSE, with *errorcodeptr set non-zero on error                   FALSE, with *errorcodeptr set non-zero on error
2153  */  */
2154    
2155  static BOOL  static BOOL
2156  compile_branch(int *optionsptr, int *brackets, uschar **codeptr,  compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2157    const uschar **ptrptr, int *errorcodeptr, int *firstbyteptr,    int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2158    int *reqbyteptr, branch_chain *bcptr, compile_data *cd)    compile_data *cd, int *lengthptr)
2159  {  {
2160  int repeat_type, op_type;  int repeat_type, op_type;
2161  int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */  int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
# Line 1613  int zeroreqbyte, zerofirstbyte; Line 2166  int zeroreqbyte, zerofirstbyte;
2166  int req_caseopt, reqvary, tempreqvary;  int req_caseopt, reqvary, tempreqvary;
2167  int options = *optionsptr;  int options = *optionsptr;
2168  int after_manual_callout = 0;  int after_manual_callout = 0;
2169    int length_prevgroup = 0;
2170  register int c;  register int c;
2171  register uschar *code = *codeptr;  register uschar *code = *codeptr;
2172    uschar *last_code = code;
2173    uschar *orig_code = code;
2174  uschar *tempcode;  uschar *tempcode;
2175  BOOL inescq = FALSE;  BOOL inescq = FALSE;
2176  BOOL groupsetfirstbyte = FALSE;  BOOL groupsetfirstbyte = FALSE;
# Line 1622  const uschar *ptr = *ptrptr; Line 2178  const uschar *ptr = *ptrptr;
2178  const uschar *tempptr;  const uschar *tempptr;
2179  uschar *previous = NULL;  uschar *previous = NULL;
2180  uschar *previous_callout = NULL;  uschar *previous_callout = NULL;
2181    uschar *save_hwm = NULL;
2182  uschar classbits[32];  uschar classbits[32];
2183    
2184  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
# Line 1631  uschar *class_utf8data; Line 2188  uschar *class_utf8data;
2188  uschar utf8_char[6];  uschar utf8_char[6];
2189  #else  #else
2190  BOOL utf8 = FALSE;  BOOL utf8 = FALSE;
2191    uschar *utf8_char = NULL;
2192    #endif
2193    
2194    #ifdef DEBUG
2195    if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2196  #endif  #endif
2197    
2198  /* Set up the default and non-default settings for greediness */  /* Set up the default and non-default settings for greediness */
# Line 1664  for (;; ptr++) Line 2226  for (;; ptr++)
2226    BOOL negate_class;    BOOL negate_class;
2227    BOOL possessive_quantifier;    BOOL possessive_quantifier;
2228    BOOL is_quantifier;    BOOL is_quantifier;
2229      BOOL is_recurse;
2230      BOOL reset_bracount;
2231    int class_charcount;    int class_charcount;
2232    int class_lastchar;    int class_lastchar;
2233    int newoptions;    int newoptions;
2234    int recno;    int recno;
2235      int refsign;
2236    int skipbytes;    int skipbytes;
2237    int subreqbyte;    int subreqbyte;
2238    int subfirstbyte;    int subfirstbyte;
2239      int terminator;
2240    int mclength;    int mclength;
2241    uschar mcbuffer[8];    uschar mcbuffer[8];
2242    
2243    /* Next byte in the pattern */    /* Get next byte in the pattern */
2244    
2245    c = *ptr;    c = *ptr;
2246    
2247    /* If in \Q...\E, check for the end; if not, we have a literal */    /* If we are in the pre-compile phase, accumulate the length used for the
2248      previous cycle of this loop. */
2249    
2250    if (inescq && c != 0)    if (lengthptr != NULL)
2251      {      {
2252      if (c == '\\' && ptr[1] == 'E')  #ifdef DEBUG
2253        if (code > cd->hwm) cd->hwm = code;                 /* High water info */
2254    #endif
2255        if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2256        {        {
2257        inescq = FALSE;        *errorcodeptr = ERR52;
2258        ptr++;        goto FAILED;
       continue;  
2259        }        }
2260      else  
2261        /* There is at least one situation where code goes backwards: this is the
2262        case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2263        the class is simply eliminated. However, it is created first, so we have to
2264        allow memory for it. Therefore, don't ever reduce the length at this point.
2265        */
2266    
2267        if (code < last_code) code = last_code;
2268    
2269        /* Paranoid check for integer overflow */
2270    
2271        if (OFLOW_MAX - *lengthptr < code - last_code)
2272        {        {
2273        if (previous_callout != NULL)        *errorcodeptr = ERR20;
2274          goto FAILED;
2275          }
2276    
2277        *lengthptr += code - last_code;
2278        DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2279    
2280        /* If "previous" is set and it is not at the start of the work space, move
2281        it back to there, in order to avoid filling up the work space. Otherwise,
2282        if "previous" is NULL, reset the current code pointer to the start. */
2283    
2284        if (previous != NULL)
2285          {
2286          if (previous > orig_code)
2287          {          {
2288          complete_callout(previous_callout, ptr, cd);          memmove(orig_code, previous, code - previous);
2289            code -= previous - orig_code;
2290            previous = orig_code;
2291            }
2292          }
2293        else code = orig_code;
2294    
2295        /* Remember where this code item starts so we can pick up the length
2296        next time round. */
2297    
2298        last_code = code;
2299        }
2300    
2301      /* In the real compile phase, just check the workspace used by the forward
2302      reference list. */
2303    
2304      else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2305        {
2306        *errorcodeptr = ERR52;
2307        goto FAILED;
2308        }
2309    
2310      /* If in \Q...\E, check for the end; if not, we have a literal */
2311    
2312      if (inescq && c != 0)
2313        {
2314        if (c == '\\' && ptr[1] == 'E')
2315          {
2316          inescq = FALSE;
2317          ptr++;
2318          continue;
2319          }
2320        else
2321          {
2322          if (previous_callout != NULL)
2323            {
2324            if (lengthptr == NULL)  /* Don't attempt in pre-compile phase */
2325              complete_callout(previous_callout, ptr, cd);
2326          previous_callout = NULL;          previous_callout = NULL;
2327          }          }
2328        if ((options & PCRE_AUTO_CALLOUT) != 0)        if ((options & PCRE_AUTO_CALLOUT) != 0)
# Line 1713  for (;; ptr++) Line 2343  for (;; ptr++)
2343    if (!is_quantifier && previous_callout != NULL &&    if (!is_quantifier && previous_callout != NULL &&
2344         after_manual_callout-- <= 0)         after_manual_callout-- <= 0)
2345      {      {
2346      complete_callout(previous_callout, ptr, cd);      if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
2347          complete_callout(previous_callout, ptr, cd);
2348      previous_callout = NULL;      previous_callout = NULL;
2349      }      }
2350    
# Line 1724  for (;; ptr++) Line 2355  for (;; ptr++)
2355      if ((cd->ctypes[c] & ctype_space) != 0) continue;      if ((cd->ctypes[c] & ctype_space) != 0) continue;
2356      if (c == '#')      if (c == '#')
2357        {        {
2358        while (*(++ptr) != 0) if (IS_NEWLINE(ptr)) break;        while (*(++ptr) != 0)
       if (*ptr != 0)  
2359          {          {
2360          ptr += cd->nllen - 1;          if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
         continue;  
2361          }          }
2362          if (*ptr != 0) continue;
2363    
2364        /* Else fall through to handle end of string */        /* Else fall through to handle end of string */
2365        c = 0;        c = 0;
2366        }        }
# Line 1745  for (;; ptr++) Line 2376  for (;; ptr++)
2376    
2377    switch(c)    switch(c)
2378      {      {
2379      /* The branch terminates at end of string, |, or ). */      /* ===================================================================*/
2380        case 0:                        /* The branch terminates at string end */
2381      case 0:      case '|':                      /* or | or ) */
     case '|':  
2382      case ')':      case ')':
2383      *firstbyteptr = firstbyte;      *firstbyteptr = firstbyte;
2384      *reqbyteptr = reqbyte;      *reqbyteptr = reqbyte;
2385      *codeptr = code;      *codeptr = code;
2386      *ptrptr = ptr;      *ptrptr = ptr;
2387        if (lengthptr != NULL)
2388          {
2389          if (OFLOW_MAX - *lengthptr < code - last_code)
2390            {
2391            *errorcodeptr = ERR20;
2392            goto FAILED;
2393            }
2394          *lengthptr += code - last_code;   /* To include callout length */
2395          DPRINTF((">> end branch\n"));
2396          }
2397      return TRUE;      return TRUE;
2398    
2399    
2400        /* ===================================================================*/
2401      /* Handle single-character metacharacters. In multiline mode, ^ disables      /* Handle single-character metacharacters. In multiline mode, ^ disables
2402      the setting of any following char as a first character. */      the setting of any following char as a first character. */
2403    
# Line 1784  for (;; ptr++) Line 2426  for (;; ptr++)
2426      *code++ = OP_ANY;      *code++ = OP_ANY;
2427      break;      break;
2428    
2429    
2430        /* ===================================================================*/
2431      /* Character classes. If the included characters are all < 256, we build a      /* Character classes. If the included characters are all < 256, we build a
2432      32-byte bitmap of the permitted characters, except in the special case      32-byte bitmap of the permitted characters, except in the special case
2433      where there is only one such character. For negated classes, we build the      where there is only one such character. For negated classes, we build the
# Line 1809  for (;; ptr++) Line 2453  for (;; ptr++)
2453        goto FAILED;        goto FAILED;
2454        }        }
2455    
2456      /* If the first character is '^', set the negation flag and skip it. */      /* If the first character is '^', set the negation flag and skip it. Also,
2457        if the first few characters (either before or after ^) are \Q\E or \E we
2458      if ((c = *(++ptr)) == '^')      skip them too. This makes for compatibility with Perl. */
2459    
2460        negate_class = FALSE;
2461        for (;;)
2462        {        {
       negate_class = TRUE;  
2463        c = *(++ptr);        c = *(++ptr);
2464        }        if (c == '\\')
2465      else          {
2466        {          if (ptr[1] == 'E') ptr++;
2467        negate_class = FALSE;            else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
2468        }              else break;
2469            }
2470          else if (!negate_class && c == '^')
2471            negate_class = TRUE;
2472          else break;
2473          }
2474    
2475      /* Keep a count of chars with values < 256 so that we can optimize the case      /* Keep a count of chars with values < 256 so that we can optimize the case
2476      of just a single character (as long as it's < 256). For higher valued UTF-8      of just a single character (as long as it's < 256). However, for higher
2477      characters, we don't yet do any optimization. */      valued UTF-8 characters, we don't yet do any optimization. */
2478    
2479      class_charcount = 0;      class_charcount = 0;
2480      class_lastchar = -1;      class_lastchar = -1;
2481    
2482        /* Initialize the 32-char bit map to all zeros. We build the map in a
2483        temporary bit of memory, in case the class contains only 1 character (less
2484        than 256), because in that case the compiled code doesn't use the bit map.
2485        */
2486    
2487        memset(classbits, 0, 32 * sizeof(uschar));
2488    
2489  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2490      class_utf8 = FALSE;                       /* No chars >= 256 */      class_utf8 = FALSE;                       /* No chars >= 256 */
2491      class_utf8data = code + LINK_SIZE + 34;   /* For UTF-8 items */      class_utf8data = code + LINK_SIZE + 2;    /* For UTF-8 items */
2492  #endif  #endif
2493    
     /* Initialize the 32-char bit map to all zeros. We have to build the  
     map in a temporary bit of store, in case the class contains only 1  
     character (< 256), because in that case the compiled code doesn't use the  
     bit map. */  
   
     memset(classbits, 0, 32 * sizeof(uschar));  
   
2494      /* Process characters until ] is reached. By writing this as a "do" it      /* Process characters until ] is reached. By writing this as a "do" it
2495      means that an initial ] is taken as a data character. The first pass      means that an initial ] is taken as a data character. At the start of the
2496      through the regex checked the overall syntax, so we don't need to be very      loop, c contains the first byte of the character. */
     strict here. At the start of the loop, c contains the first byte of the  
     character. */  
2497    
2498      do      if (c != 0) do
2499        {        {
2500          const uschar *oldptr;
2501    
2502  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2503        if (utf8 && c > 127)        if (utf8 && c > 127)
2504          {                           /* Braces are required because the */          {                           /* Braces are required because the */
# Line 1859  for (;; ptr++) Line 2510  for (;; ptr++)
2510    
2511        if (inescq)        if (inescq)
2512          {          {
2513          if (c == '\\' && ptr[1] == 'E')          if (c == '\\' && ptr[1] == 'E')     /* If we are at \E */
2514            {            {
2515            inescq = FALSE;            inescq = FALSE;                   /* Reset literal state */
2516            ptr++;            ptr++;                            /* Skip the 'E' */
2517            continue;            continue;                         /* Carry on with next */
2518            }            }
2519          else goto LONE_SINGLE_CHARACTER;          goto CHECK_RANGE;                   /* Could be range if \E follows */
2520          }          }
2521    
2522        /* Handle POSIX class names. Perl allows a negation extension of the        /* Handle POSIX class names. Perl allows a negation extension of the
# Line 1956  for (;; ptr++) Line 2607  for (;; ptr++)
2607          }          }
2608    
2609        /* Backslash may introduce a single character, or it may introduce one        /* Backslash may introduce a single character, or it may introduce one
2610        of the specials, which just set a flag. Escaped items are checked for        of the specials, which just set a flag. The sequence \b is a special
2611        validity in the pre-compiling pass. The sequence \b is a special case.        case. Inside a class (and only there) it is treated as backspace.
2612        Inside a class (and only there) it is treated as backspace. Elsewhere        Elsewhere it marks a word boundary. Other escapes have preset maps ready
2613        it marks a word boundary. Other escapes have preset maps ready to        to 'or' into the one we are building. We assume they have more than one
       or into the one we are building. We assume they have more than one  
2614        character in them, so set class_charcount bigger than one. */        character in them, so set class_charcount bigger than one. */
2615    
2616        if (c == '\\')        if (c == '\\')
2617          {          {
2618          c = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);          c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2619            if (*errorcodeptr != 0) goto FAILED;
2620    
2621          if (-c == ESC_b) c = '\b';       /* \b is backspace in a class */          if (-c == ESC_b) c = '\b';       /* \b is backspace in a class */
2622          else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */          else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */
2623            else if (-c == ESC_R) c = 'R';   /* \R is literal R in a class */
2624          else if (-c == ESC_Q)            /* Handle start of quoted string */          else if (-c == ESC_Q)            /* Handle start of quoted string */
2625            {            {
2626            if (ptr[1] == '\\' && ptr[2] == 'E')            if (ptr[1] == '\\' && ptr[2] == 'E')
# Line 1983  for (;; ptr++) Line 2635  for (;; ptr++)
2635            {            {
2636            register const uschar *cbits = cd->cbits;            register const uschar *cbits = cd->cbits;
2637            class_charcount += 2;     /* Greater than 1 is what matters */            class_charcount += 2;     /* Greater than 1 is what matters */
2638            switch (-c)  
2639              /* Save time by not doing this in the pre-compile phase. */
2640    
2641              if (lengthptr == NULL) switch (-c)
2642              {              {
2643              case ESC_d:              case ESC_d:
2644              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
# Line 2011  for (;; ptr++) Line 2666  for (;; ptr++)
2666              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
2667              continue;              continue;
2668    
2669  #ifdef SUPPORT_UCP              case ESC_E: /* Perl ignores an orphan \E */
2670              case ESC_p:              continue;
2671              case ESC_P:  
2672                default:    /* Not recognized; fall through */
2673                break;      /* Need "default" setting to stop compiler warning. */
2674                }
2675    
2676              /* In the pre-compile phase, just do the recognition. */
2677    
2678              else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2679                       c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2680    
2681              /* We need to deal with \H, \h, \V, and \v in both phases because
2682              they use extra memory. */
2683    
2684              if (-c == ESC_h)
2685                {
2686                SETBIT(classbits, 0x09); /* HT */
2687                SETBIT(classbits, 0x20); /* SPACE */
2688                SETBIT(classbits, 0xa0); /* NBSP */
2689    #ifdef SUPPORT_UTF8
2690                if (utf8)
2691                {                {
               BOOL negated;  
               int pdata;  
               int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);  
               if (ptype < 0) goto FAILED;  
2692                class_utf8 = TRUE;                class_utf8 = TRUE;
2693                *class_utf8data++ = ((-c == ESC_p) != negated)?                *class_utf8data++ = XCL_SINGLE;
2694                  XCL_PROP : XCL_NOTPROP;                class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2695                *class_utf8data++ = ptype;                *class_utf8data++ = XCL_SINGLE;
2696                *class_utf8data++ = pdata;                class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2697                class_charcount -= 2;   /* Not a < 256 character */                *class_utf8data++ = XCL_RANGE;
2698                  class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2699                  class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2700                  *class_utf8data++ = XCL_SINGLE;
2701                  class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2702                  *class_utf8data++ = XCL_SINGLE;
2703                  class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2704                  *class_utf8data++ = XCL_SINGLE;
2705                  class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
2706                }                }
2707    #endif
2708              continue;              continue;
2709                }
2710    
2711              if (-c == ESC_H)
2712                {
2713                for (c = 0; c < 32; c++)
2714                  {
2715                  int x = 0xff;
2716                  switch (c)
2717                    {
2718                    case 0x09/8: x ^= 1 << (0x09%8); break;
2719                    case 0x20/8: x ^= 1 << (0x20%8); break;
2720                    case 0xa0/8: x ^= 1 << (0xa0%8); break;
2721                    default: break;
2722                    }
2723                  classbits[c] |= x;
2724                  }
2725    
2726    #ifdef SUPPORT_UTF8
2727                if (utf8)
2728                  {
2729                  class_utf8 = TRUE;
2730                  *class_utf8data++ = XCL_RANGE;
2731                  class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2732                  class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
2733                  *class_utf8data++ = XCL_RANGE;
2734                  class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
2735                  class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
2736                  *class_utf8data++ = XCL_RANGE;
2737                  class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
2738                  class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
2739                  *class_utf8data++ = XCL_RANGE;
2740                  class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
2741                  class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
2742                  *class_utf8data++ = XCL_RANGE;
2743                  class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
2744                  class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
2745                  *class_utf8data++ = XCL_RANGE;
2746                  class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
2747                  class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
2748                  *class_utf8data++ = XCL_RANGE;
2749                  class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
2750                  class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2751                  }
2752  #endif  #endif
2753                continue;
2754                }
2755    
2756              /* Unrecognized escapes are faulted if PCRE is running in its            if (-c == ESC_v)
2757              strict mode. By default, for compatibility with Perl, they are              {
2758              treated as literals. */              SETBIT(classbits, 0x0a); /* LF */
2759                SETBIT(classbits, 0x0b); /* VT */
2760                SETBIT(classbits, 0x0c); /* FF */
2761                SETBIT(classbits, 0x0d); /* CR */
2762                SETBIT(classbits, 0x85); /* NEL */
2763    #ifdef SUPPORT_UTF8
2764                if (utf8)
2765                  {
2766                  class_utf8 = TRUE;
2767                  *class_utf8data++ = XCL_RANGE;
2768                  class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
2769                  class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2770                  }
2771    #endif
2772                continue;
2773                }
2774    
2775              default:            if (-c == ESC_V)
2776              if ((options & PCRE_EXTRA) != 0)              {
2777                for (c = 0; c < 32; c++)
2778                {                {
2779                *errorcodeptr = ERR7;                int x = 0xff;
2780                goto FAILED;                switch (c)
2781                    {
2782                    case 0x0a/8: x ^= 1 << (0x0a%8);
2783                                 x ^= 1 << (0x0b%8);
2784                                 x ^= 1 << (0x0c%8);
2785                                 x ^= 1 << (0x0d%8);
2786                                 break;
2787                    case 0x85/8: x ^= 1 << (0x85%8); break;
2788                    default: break;
2789                    }
2790                  classbits[c] |= x;
2791                  }
2792    
2793    #ifdef SUPPORT_UTF8
2794                if (utf8)
2795                  {
2796                  class_utf8 = TRUE;
2797                  *class_utf8data++ = XCL_RANGE;
2798                  class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2799                  class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
2800                  *class_utf8data++ = XCL_RANGE;
2801                  class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2802                  class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2803                }                }
2804              c = *ptr;              /* The final character */  #endif
2805              class_charcount -= 2;  /* Undo the default count from above */              continue;
2806                }
2807    
2808              /* We need to deal with \P and \p in both phases. */
2809    
2810    #ifdef SUPPORT_UCP
2811              if (-c == ESC_p || -c == ESC_P)
2812                {
2813                BOOL negated;
2814                int pdata;
2815                int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2816                if (ptype < 0) goto FAILED;
2817                class_utf8 = TRUE;
2818                *class_utf8data++ = ((-c == ESC_p) != negated)?
2819                  XCL_PROP : XCL_NOTPROP;
2820                *class_utf8data++ = ptype;
2821                *class_utf8data++ = pdata;
2822                class_charcount -= 2;   /* Not a < 256 character */
2823                continue;
2824                }
2825    #endif
2826              /* Unrecognized escapes are faulted if PCRE is running in its
2827              strict mode. By default, for compatibility with Perl, they are
2828              treated as literals. */
2829    
2830              if ((options & PCRE_EXTRA) != 0)
2831                {
2832                *errorcodeptr = ERR7;
2833                goto FAILED;
2834              }              }
2835    
2836              class_charcount -= 2;  /* Undo the default count from above */
2837              c = *ptr;              /* Get the final character and fall through */
2838            }            }
2839    
2840          /* Fall through if we have a single character (c >= 0). This may be          /* Fall through if we have a single character (c >= 0). This may be
2841          > 256 in UTF-8 mode. */          greater than 256 in UTF-8 mode. */
2842    
2843          }   /* End of backslash handling */          }   /* End of backslash handling */
2844    
2845        /* A single character may be followed by '-' to form a range. However,        /* A single character may be followed by '-' to form a range. However,
2846        Perl does not permit ']' to be the end of the range. A '-' character        Perl does not permit ']' to be the end of the range. A '-' character
2847        here is treated as a literal. */        at the end is treated as a literal. Perl ignores orphaned \E sequences
2848          entirely. The code for handling \Q and \E is messy. */
2849    
2850          CHECK_RANGE:
2851          while (ptr[1] == '\\' && ptr[2] == 'E')
2852            {
2853            inescq = FALSE;
2854            ptr += 2;
2855            }
2856    
2857        if (ptr[1] == '-' && ptr[2] != ']')        oldptr = ptr;
2858    
2859          if (!inescq && ptr[1] == '-')
2860          {          {
2861          int d;          int d;
2862          ptr += 2;          ptr += 2;
2863            while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
2864    
2865            /* If we hit \Q (not followed by \E) at this point, go into escaped
2866            mode. */
2867    
2868            while (*ptr == '\\' && ptr[1] == 'Q')
2869              {
2870              ptr += 2;
2871              if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
2872              inescq = TRUE;
2873              break;
2874              }
2875    
2876            if (*ptr == 0 || (!inescq && *ptr == ']'))
2877              {
2878              ptr = oldptr;
2879              goto LONE_SINGLE_CHARACTER;
2880              }
2881    
2882  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2883          if (utf8)          if (utf8)
# Line 2071  for (;; ptr++) Line 2892  for (;; ptr++)
2892          not any of the other escapes. Perl 5.6 treats a hyphen as a literal          not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2893          in such circumstances. */          in such circumstances. */
2894    
2895          if (d == '\\')          if (!inescq && d == '\\')
2896            {            {
2897            const uschar *oldptr = ptr;            d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2898            d = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);            if (*errorcodeptr != 0) goto FAILED;
2899    
2900            /* \b is backspace; \X is literal X; any other special means the '-'            /* \b is backspace; \X is literal X; \R is literal R; any other
2901            was literal */            special means the '-' was literal */
2902    
2903            if (d < 0)            if (d < 0)
2904              {              {
2905              if (d == -ESC_b) d = '\b';              if (d == -ESC_b) d = '\b';
2906              else if (d == -ESC_X) d = 'X'; else              else if (d == -ESC_X) d = 'X';
2907                else if (d == -ESC_R) d = 'R'; else
2908                {                {
2909                ptr = oldptr - 2;                ptr = oldptr;
2910                goto LONE_SINGLE_CHARACTER;  /* A few lines below */                goto LONE_SINGLE_CHARACTER;  /* A few lines below */
2911                }                }
2912              }              }
2913            }            }
2914    
2915          /* The check that the two values are in the correct order happens in          /* Check that the two values are in the correct order. Optimize
2916          the pre-pass. Optimize one-character ranges */          one-character ranges */
2917    
2918            if (d < c)
2919              {
2920              *errorcodeptr = ERR8;
2921              goto FAILED;
2922              }
2923    
2924          if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */          if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */
2925    
# Line 2112  for (;; ptr++) Line 2940  for (;; ptr++)
2940  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2941            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
2942              {              {
2943              int occ, ocd;              unsigned int occ, ocd;
2944              int cc = c;              unsigned int cc = c;
2945              int origd = d;              unsigned int origd = d;
2946              while (get_othercase_range(&cc, origd, &occ, &ocd))              while (get_othercase_range(&cc, origd, &occ, &ocd))
2947                {                {
2948                if (occ >= c && ocd <= d) continue;  /* Skip embedded ranges */                if (occ >= (unsigned int)c &&
2949                      ocd <= (unsigned int)d)
2950                    continue;                          /* Skip embedded ranges */
2951    
2952                if (occ < c  && ocd >= c - 1)        /* Extend the basic range */                if (occ < (unsigned int)c  &&
2953                      ocd >= (unsigned int)c - 1)      /* Extend the basic range */
2954                  {                                  /* if there is overlap,   */                  {                                  /* if there is overlap,   */
2955                  c = occ;                           /* noting that if occ < c */                  c = occ;                           /* noting that if occ < c */
2956                  continue;                          /* we can't have ocd > d  */                  continue;                          /* we can't have ocd > d  */
2957                  }                                  /* because a subrange is  */                  }                                  /* because a subrange is  */
2958                if (ocd > d && occ <= d + 1)         /* always shorter than    */                if (ocd > (unsigned int)d &&
2959                      occ <= (unsigned int)d + 1)      /* always shorter than    */
2960                  {                                  /* the basic range.       */                  {                                  /* the basic range.       */
2961                  d = ocd;                  d = ocd;
2962                  continue;                  continue;
# Line 2172  for (;; ptr++) Line 3004  for (;; ptr++)
3004          ranges that lie entirely within 0-127 when there is UCP support; else          ranges that lie entirely within 0-127 when there is UCP support; else
3005          for partial ranges without UCP support. */          for partial ranges without UCP support. */
3006    
3007          for (; c <= d; c++)          class_charcount += d - c + 1;
3008            class_lastchar = d;
3009    
3010            /* We can save a bit of time by skipping this in the pre-compile. */
3011    
3012            if (lengthptr == NULL) for (; c <= d; c++)
3013            {            {
3014            classbits[c/8] |= (1 << (c&7));            classbits[c/8] |= (1 << (c&7));
3015            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
# Line 2180  for (;; ptr++) Line 3017  for (;; ptr++)
3017              int uc = cd->fcc[c];           /* flip case */              int uc = cd->fcc[c];           /* flip case */
3018              classbits[uc/8] |= (1 << (uc&7));              classbits[uc/8] |= (1 << (uc&7));
3019              }              }
           class_charcount++;                /* in case a one-char range */  
           class_lastchar = c;  
3020            }            }
3021    
3022          continue;   /* Go get the next char in the class */          continue;   /* Go get the next char in the class */
# Line 2205  for (;; ptr++) Line 3040  for (;; ptr++)
3040  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
3041          if ((options & PCRE_CASELESS) != 0)          if ((options & PCRE_CASELESS) != 0)
3042            {            {
3043            int othercase;            unsigned int othercase;
3044            if ((othercase = _pcre_ucp_othercase(c)) >= 0)            if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
3045              {              {
3046              *class_utf8data++ = XCL_SINGLE;              *class_utf8data++ = XCL_SINGLE;
3047              class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);              class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
# Line 2231  for (;; ptr++) Line 3066  for (;; ptr++)
3066          }          }
3067        }        }
3068    
3069      /* Loop until ']' reached; the check for end of string happens inside the      /* Loop until ']' reached. This "while" is the end of the "do" above. */
     loop. This "while" is the end of the "do" above. */  
3070    
3071      while ((c = *(++ptr)) != ']' || inescq);      while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
3072    
3073        if (c == 0)                          /* Missing terminating ']' */
3074          {
3075          *errorcodeptr = ERR6;
3076          goto FAILED;
3077          }
3078    
3079      /* If class_charcount is 1, we saw precisely one character whose value is      /* If class_charcount is 1, we saw precisely one character whose value is
3080      less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we      less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
3081      can optimize the negative case only if there were no characters >= 128      can optimize the negative case only if there were no characters >= 128
# Line 2298  for (;; ptr++) Line 3138  for (;; ptr++)
3138    
3139      /* If there are characters with values > 255, we have to compile an      /* If there are characters with values > 255, we have to compile an
3140      extended class, with its own opcode. If there are no characters < 256,      extended class, with its own opcode. If there are no characters < 256,
3141      we can omit the bitmap. */      we can omit the bitmap in the actual compiled code. */
3142    
3143  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
3144      if (class_utf8)      if (class_utf8)
# Line 2308  for (;; ptr++) Line 3148  for (;; ptr++)
3148        code += LINK_SIZE;        code += LINK_SIZE;
3149        *code = negate_class? XCL_NOT : 0;        *code = negate_class? XCL_NOT : 0;
3150    
3151        /* If the map is required, install it, and move on to the end of        /* If the map is required, move up the extra data to make room for it;
3152        the extra data */        otherwise just move the code pointer to the end of the extra data. */
3153    
3154        if (class_charcount > 0)        if (class_charcount > 0)
3155          {          {
3156          *code++ |= XCL_MAP;          *code++ |= XCL_MAP;
3157            memmove(code + 32, code, class_utf8data - code);
3158          memcpy(code, classbits, 32);          memcpy(code, classbits, 32);
3159          code = class_utf8data;          code = class_utf8data + 32;
         }  
   
       /* If the map is not required, slide down the extra data. */  
   
       else  
         {  
         int len = class_utf8data - (code + 33);  
         memmove(code + 1, code + 33, len);  
         code += len + 1;  
3160          }          }
3161          else code = class_utf8data;
3162    
3163        /* Now fill in the complete length of the item */        /* Now fill in the complete length of the item */
3164    
# Line 2342  for (;; ptr++) Line 3175  for (;; ptr++)
3175      if (negate_class)      if (negate_class)
3176        {        {
3177        *code++ = OP_NCLASS;        *code++ = OP_NCLASS;
3178        for (c = 0; c < 32; c++) code[c] = ~classbits[c];        if (lengthptr == NULL)    /* Save time in the pre-compile phase */
3179            for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3180        }        }
3181      else      else
3182        {        {
# Line 2352  for (;; ptr++) Line 3186  for (;; ptr++)
3186      code += 32;      code += 32;
3187      break;      break;
3188    
3189    
3190        /* ===================================================================*/
3191      /* Various kinds of repeat; '{' is not necessarily a quantifier, but this      /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3192      has been tested above. */      has been tested above. */
3193    
# Line 2419  for (;; ptr++) Line 3255  for (;; ptr++)
3255        }        }
3256      else repeat_type = greedy_default;      else repeat_type = greedy_default;
3257    
     /* If previous was a recursion, we need to wrap it inside brackets so that  
     it can be replicated if necessary. */  
   
     if (*previous == OP_RECURSE)  
       {  
       memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);  
       code += 1 + LINK_SIZE;  
       *previous = OP_BRA;  
       PUT(previous, 1, code - previous);  
       *code = OP_KET;  
       PUT(code, 1, code - previous);  
       code += 1 + LINK_SIZE;  
       }  
   
3258      /* If previous was a character match, abolish the item and generate a      /* If previous was a character match, abolish the item and generate a
3259      repeat item instead. If a char item has a minimum of more than one, ensure      repeat item instead. If a char item has a minimum of more than one, ensure
3260      that it is set in reqbyte - it might not be if a sequence such as x{3} is      that it is set in reqbyte - it might not be if a sequence such as x{3} is
# Line 2466  for (;; ptr++) Line 3288  for (;; ptr++)
3288          if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;          if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3289          }          }
3290    
3291          /* If the repetition is unlimited, it pays to see if the next thing on
3292          the line is something that cannot possibly match this character. If so,
3293          automatically possessifying this item gains some performance in the case
3294          where the match fails. */
3295    
3296          if (!possessive_quantifier &&
3297              repeat_max < 0 &&
3298              check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3299                options, cd))
3300            {
3301            repeat_type = 0;    /* Force greedy */
3302            possessive_quantifier = TRUE;
3303            }
3304    
3305        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
3306        }        }
3307    
3308      /* If previous was a single negated character ([^a] or similar), we use      /* If previous was a single negated character ([^a] or similar), we use
3309      one of the special opcodes, replacing it. The code is shared with single-      one of the special opcodes, replacing it. The code is shared with single-
3310      character repeats by setting opt_type to add a suitable offset into      character repeats by setting opt_type to add a suitable offset into
3311      repeat_type. OP_NOT is currently used only for single-byte chars. */      repeat_type. We can also test for auto-possessification. OP_NOT is
3312        currently used only for single-byte chars. */
3313    
3314      else if (*previous == OP_NOT)      else if (*previous == OP_NOT)
3315        {        {
3316        op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */        op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */
3317        c = previous[1];        c = previous[1];
3318          if (!possessive_quantifier &&
3319              repeat_max < 0 &&
3320              check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3321            {
3322            repeat_type = 0;    /* Force greedy */
3323            possessive_quantifier = TRUE;
3324            }
3325        goto OUTPUT_SINGLE_REPEAT;        goto OUTPUT_SINGLE_REPEAT;
3326        }        }
3327    
# Line 2495  for (;; ptr++) Line 3339  for (;; ptr++)
3339        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
3340        c = *previous;        c = *previous;
3341    
3342          if (!possessive_quantifier &&
3343              repeat_max < 0 &&
3344              check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3345            {
3346            repeat_type = 0;    /* Force greedy */
3347            possessive_quantifier = TRUE;
3348            }
3349    
3350        OUTPUT_SINGLE_REPEAT:        OUTPUT_SINGLE_REPEAT:
3351        if (*previous == OP_PROP || *previous == OP_NOTPROP)        if (*previous == OP_PROP || *previous == OP_NOTPROP)
3352          {          {
# Line 2535  for (;; ptr++) Line 3387  for (;; ptr++)
3387          }          }
3388    
3389        /* A repeat minimum of 1 is optimized into some special cases. If the        /* A repeat minimum of 1 is optimized into some special cases. If the
3390        maximum is unlimited, we use OP_PLUS. Otherwise, the original item it        maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3391        left in place and, if the maximum is greater than 1, we use OP_UPTO with        left in place and, if the maximum is greater than 1, we use OP_UPTO with
3392        one less than the maximum. */        one less than the maximum. */
3393    
# Line 2588  for (;; ptr++) Line 3440  for (;; ptr++)
3440            }            }
3441    
3442          /* Else insert an UPTO if the max is greater than the min, again          /* Else insert an UPTO if the max is greater than the min, again
3443          preceded by the character, for the previously inserted code. */          preceded by the character, for the previously inserted code. If the
3444            UPTO is just for 1 instance, we can use QUERY instead. */
3445    
3446          else if (repeat_max != repeat_min)          else if (repeat_max != repeat_min)
3447            {            {
# Line 2607  for (;; ptr++) Line 3460  for (;; ptr++)
3460              *code++ = prop_value;              *code++ = prop_value;
3461              }              }
3462            repeat_max -= repeat_min;            repeat_max -= repeat_min;
3463            *code++ = OP_UPTO + repeat_type;  
3464            PUT2INC(code, 0, repeat_max);            if (repeat_max == 1)
3465                {
3466                *code++ = OP_QUERY + repeat_type;
3467                }
3468              else
3469                {
3470                *code++ = OP_UPTO + repeat_type;
3471                PUT2INC(code, 0, repeat_max);
3472                }
3473            }            }
3474          }          }
3475    
# Line 2675  for (;; ptr++) Line 3536  for (;; ptr++)
3536      /* If previous was a bracket group, we may have to replicate it in certain      /* If previous was a bracket group, we may have to replicate it in certain
3537      cases. */      cases. */
3538    
3539      else if (*previous >= OP_BRA || *previous == OP_ONCE ||      else if (*previous == OP_BRA  || *previous == OP_CBRA ||
3540               *previous == OP_COND)               *previous == OP_ONCE || *previous == OP_COND)
3541        {        {
3542        register int i;        register int i;
3543        int ketoffset = 0;        int ketoffset = 0;
3544        int len = code - previous;        int len = code - previous;
3545        uschar *bralink = NULL;        uschar *bralink = NULL;
3546    
3547          /* Repeating a DEFINE group is pointless */
3548    
3549          if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3550            {
3551            *errorcodeptr = ERR55;
3552            goto FAILED;
3553            }
3554    
3555        /* If the maximum repeat count is unlimited, find the end of the bracket        /* If the maximum repeat count is unlimited, find the end of the bracket
3556        by scanning through from the start, and compute the offset back to it        by scanning through from the start, and compute the offset back to it
3557        from the current code pointer. There may be an OP_OPT setting following        from the current code pointer. There may be an OP_OPT setting following
# Line 2717  for (;; ptr++) Line 3586  for (;; ptr++)
3586          /* If the maximum is 1 or unlimited, we just have to stick in the          /* If the maximum is 1 or unlimited, we just have to stick in the
3587          BRAZERO and do no more at this point. However, we do need to adjust          BRAZERO and do no more at this point. However, we do need to adjust
3588          any OP_RECURSE calls inside the group that refer to the group itself or          any OP_RECURSE calls inside the group that refer to the group itself or
3589          any internal group, because the offset is from the start of the whole          any internal or forward referenced group, because the offset is from
3590          regex. Temporarily terminate the pattern while doing this. */          the start of the whole regex. Temporarily terminate the pattern while
3591            doing this. */
3592    
3593          if (repeat_max <= 1)          if (repeat_max <= 1)
3594            {            {
3595            *code = OP_END;            *code = OP_END;
3596            adjust_recurse(previous, 1, utf8, cd);            adjust_recurse(previous, 1, utf8, cd, save_hwm);
3597            memmove(previous+1, previous, len);            memmove(previous+1, previous, len);
3598            code++;            code++;
3599            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
# Line 2741  for (;; ptr++) Line 3611  for (;; ptr++)
3611            {            {
3612            int offset;            int offset;
3613            *code = OP_END;            *code = OP_END;
3614            adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd);            adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3615            memmove(previous + 2 + LINK_SIZE, previous, len);            memmove(previous + 2 + LINK_SIZE, previous, len);
3616            code += 2 + LINK_SIZE;            code += 2 + LINK_SIZE;
3617            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
# Line 2761  for (;; ptr++) Line 3631  for (;; ptr++)
3631        /* If the minimum is greater than zero, replicate the group as many        /* If the minimum is greater than zero, replicate the group as many
3632        times as necessary, and adjust the maximum to the number of subsequent        times as necessary, and adjust the maximum to the number of subsequent
3633        copies that we need. If we set a first char from the group, and didn't        copies that we need. If we set a first char from the group, and didn't
3634        set a required char, copy the latter from the former. */        set a required char, copy the latter from the former. If there are any
3635          forward reference subroutine calls in the group, there will be entries on
3636          the workspace list; replicate these with an appropriate increment. */
3637    
3638        else        else
3639          {          {
3640          if (repeat_min > 1)          if (repeat_min > 1)
3641            {            {
3642            if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;            /* In the pre-compile phase, we don't actually do the replication. We
3643            for (i = 1; i < repeat_min; i++)            just adjust the length as if we had. Do some paranoid checks for
3644              potential integer overflow. */
3645    
3646              if (lengthptr != NULL)
3647                {
3648                int delta = (repeat_min - 1)*length_prevgroup;
3649                if ((double)(repeat_min - 1)*(double)length_prevgroup >
3650                                                                (double)INT_MAX ||
3651                    OFLOW_MAX - *lengthptr < delta)
3652                  {
3653                  *errorcodeptr = ERR20;
3654                  goto FAILED;
3655                  }
3656                *lengthptr += delta;
3657                }
3658    
3659              /* This is compiling for real */
3660    
3661              else
3662              {              {
3663              memcpy(code, previous, len);              if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3664              code += len;              for (i = 1; i < repeat_min; i++)
3665                  {
3666                  uschar *hc;
3667                  uschar *this_hwm = cd->hwm;
3668                  memcpy(code, previous, len);
3669                  for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3670                    {
3671                    PUT(cd->hwm, 0, GET(hc, 0) + len);
3672                    cd->hwm += LINK_SIZE;
3673                    }
3674                  save_hwm = this_hwm;
3675                  code += len;
3676                  }
3677              }              }
3678            }            }
3679    
3680          if (repeat_max > 0) repeat_max -= repeat_min;          if (repeat_max > 0) repeat_max -= repeat_min;
3681          }          }
3682    
# Line 2781  for (;; ptr++) Line 3684  for (;; ptr++)
3684        the maximum is limited, it replicates the group in a nested fashion,        the maximum is limited, it replicates the group in a nested fashion,
3685        remembering the bracket starts on a stack. In the case of a zero minimum,        remembering the bracket starts on a stack. In the case of a zero minimum,
3686        the first one was set up above. In all cases the repeat_max now specifies        the first one was set up above. In all cases the repeat_max now specifies
3687        the number of additional copies needed. */        the number of additional copies needed. Again, we must remember to
3688          replicate entries on the forward reference list. */
3689    
3690        if (repeat_max >= 0)        if (repeat_max >= 0)
3691          {          {
3692          for (i = repeat_max - 1; i >= 0; i--)          /* In the pre-compile phase, we don't actually do the replication. We
3693            just adjust the length as if we had. For each repetition we must add 1
3694            to the length for BRAZERO and for all but the last repetition we must
3695            add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
3696            paranoid checks to avoid integer overflow. */
3697    
3698            if (lengthptr != NULL && repeat_max > 0)
3699              {
3700              int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3701                          2 - 2*LINK_SIZE;   /* Last one doesn't nest */
3702              if ((double)repeat_max *
3703                    (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
3704                      > (double)INT_MAX ||
3705                  OFLOW_MAX - *lengthptr < delta)
3706                {
3707                *errorcodeptr = ERR20;
3708                goto FAILED;
3709                }
3710              *lengthptr += delta;
3711              }
3712    
3713            /* This is compiling for real */
3714    
3715            else for (i = repeat_max - 1; i >= 0; i--)
3716            {            {
3717              uschar *hc;
3718              uschar *this_hwm = cd->hwm;
3719    
3720            *code++ = OP_BRAZERO + repeat_type;            *code++ = OP_BRAZERO + repeat_type;
3721    
3722            /* All but the final copy start a new nesting, maintaining the            /* All but the final copy start a new nesting, maintaining the
# Line 2802  for (;; ptr++) Line 3732  for (;; ptr++)
3732              }              }
3733    
3734            memcpy(code, previous, len);            memcpy(code, previous, len);
3735              for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3736                {
3737                PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3738                cd->hwm += LINK_SIZE;
3739                }
3740              save_hwm = this_hwm;
3741            code += len;            code += len;
3742            }            }
3743    
# Line 2824  for (;; ptr++) Line 3760  for (;; ptr++)
3760        /* If the maximum is unlimited, set a repeater in the final copy. We        /* If the maximum is unlimited, set a repeater in the final copy. We
3761        can't just offset backwards from the current code point, because we        can't just offset backwards from the current code point, because we
3762        don't know if there's been an options resetting after the ket. The        don't know if there's been an options resetting after the ket. The
3763        correct offset was computed above. */        correct offset was computed above.
3764    
3765          Then, when we are doing the actual compile phase, check to see whether
3766          this group is a non-atomic one that could match an empty string. If so,
3767          convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
3768          that runtime checking can be done. [This check is also applied to
3769          atomic groups at runtime, but in a different way.] */
3770    
3771        else code[-ketoffset] = OP_KETRMAX + repeat_type;        else
3772            {
3773            uschar *ketcode = code - ketoffset;
3774            uschar *bracode = ketcode - GET(ketcode, 1);
3775            *ketcode = OP_KETRMAX + repeat_type;
3776            if (lengthptr == NULL && *bracode != OP_ONCE)
3777              {
3778              uschar *scode = bracode;
3779              do
3780                {
3781                if (could_be_empty_branch(scode, ketcode, utf8))
3782                  {
3783                  *bracode += OP_SBRA - OP_BRA;
3784                  break;
3785                  }
3786                scode += GET(scode, 1);
3787                }
3788              while (*scode == OP_ALT);
3789              }
3790            }
3791        }        }
3792    
3793      /* Else there's some kind of shambles */      /* Else there's some kind of shambles */
# Line 2837  for (;; ptr++) Line 3798  for (;; ptr++)
3798        goto FAILED;        goto FAILED;
3799        }        }
3800    
3801      /* If the character following a repeat is '+', we wrap the entire repeated      /* If the character following a repeat is '+', or if certain optimization
3802      item inside OP_ONCE brackets. This is just syntactic sugar, taken from      tests above succeeded, possessive_quantifier is TRUE. For some of the
3803      Sun's Java package. The repeated item starts at tempcode, not at previous,      simpler opcodes, there is an special alternative opcode for this. For
3804      which might be the first part of a string whose (former) last char we      anything else, we wrap the entire repeated item inside OP_ONCE brackets.
3805      repeated. However, we don't support '+' after a greediness '?'. */      The '+' notation is just syntactic sugar, taken from Sun's Java package,
3806        but the special opcodes can optimize it a bit. The repeated item starts at
3807        tempcode, not at previous, which might be the first part of a string whose
3808        (former) last char we repeated.
3809    
3810        Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
3811        an 'upto' may follow. We skip over an 'exact' item, and then test the
3812        length of what remains before proceeding. */
3813    
3814      if (possessive_quantifier)      if (possessive_quantifier)
3815        {        {
3816        int len = code - tempcode;        int len;
3817        memmove(tempcode + 1+LINK_SIZE, tempcode, len);        if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
3818        code += 1 + LINK_SIZE;            *tempcode == OP_NOTEXACT)
3819        len += 1 + LINK_SIZE;          tempcode += _pcre_OP_lengths[*tempcode];
3820        tempcode[0] = OP_ONCE;        len = code - tempcode;
3821        *code++ = OP_KET;        if (len > 0) switch (*tempcode)
3822        PUTINC(code, 0, len);          {
3823        PUT(tempcode, 1, len);          case OP_STAR:  *tempcode = OP_POSSTAR; break;
3824            case OP_PLUS:  *tempcode = OP_POSPLUS; break;
3825            case OP_QUERY: *tempcode = OP_POSQUERY; break;
3826            case OP_UPTO:  *tempcode = OP_POSUPTO; break;
3827    
3828            case OP_TYPESTAR:  *tempcode = OP_TYPEPOSSTAR; break;
3829            case OP_TYPEPLUS:  *tempcode = OP_TYPEPOSPLUS; break;
3830            case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
3831            case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;
3832    
3833            case OP_NOTSTAR:  *tempcode = OP_NOTPOSSTAR; break;
3834            case OP_NOTPLUS:  *tempcode = OP_NOTPOSPLUS; break;
3835            case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
3836            case OP_NOTUPTO:  *tempcode = OP_NOTPOSUPTO; break;
3837    
3838            default:
3839            memmove(tempcode + 1+LINK_SIZE, tempcode, len);
3840            code += 1 + LINK_SIZE;
3841            len += 1 + LINK_SIZE;
3842            tempcode[0] = OP_ONCE;
3843            *code++ = OP_KET;
3844            PUTINC(code, 0, len);
3845            PUT(tempcode, 1, len);
3846            break;
3847            }
3848        }        }
3849    
3850      /* In all cases we no longer have a previous item. We also set the      /* In all cases we no longer have a previous item. We also set the
# Line 2865  for (;; ptr++) Line 3857  for (;; ptr++)
3857      break;      break;
3858    
3859    
3860      /* Start of nested bracket sub-expression, or comment or lookahead or      /* ===================================================================*/
3861      lookbehind or option setting or condition. First deal with special things      /* Start of nested parenthesized sub-expression, or comment or lookahead or
3862      that can come after a bracket; all are introduced by ?, and the appearance      lookbehind or option setting or condition or all the other extended
3863      of any of them means that this is not a referencing group. They were      parenthesis forms. First deal with the specials; all are introduced by ?,
3864      checked for validity in the first pass over the string, so we don't have to      and the appearance of any of them means that this is not a capturing
3865      check for syntax errors here.  */      group. */
3866    
3867      case '(':      case '(':
3868      newoptions = options;      newoptions = options;
3869      skipbytes = 0;      skipbytes = 0;
3870        bravalue = OP_CBRA;
3871        save_hwm = cd->hwm;
3872        reset_bracount = FALSE;
3873    
3874      if (*(++ptr) == '?')      if (*(++ptr) == '?')
3875        {        {
3876        int set, unset;        int i, set, unset, namelen;
3877        int *optset;        int *optset;
3878          const uschar *name;
3879          uschar *slot;
3880    
3881        switch (*(++ptr))        switch (*(++ptr))
3882          {          {
3883          case '#':                 /* Comment; skip to ket */          case '#':                 /* Comment; skip to ket */
3884          ptr++;          ptr++;
3885          while (*ptr != ')') ptr++;          while (*ptr != 0 && *ptr != ')') ptr++;
3886            if (*ptr == 0)
3887              {
3888              *errorcodeptr = ERR18;
3889              goto FAILED;
3890              }
3891          continue;          continue;
3892    
3893          case ':':                 /* Non-extracting bracket */  
3894            /* ------------------------------------------------------------ */
3895            case '|':                 /* Reset capture count for each branch */
3896            reset_bracount = TRUE;
3897            /* Fall through */
3898    
3899            /* ------------------------------------------------------------ */
3900            case ':':                 /* Non-capturing bracket */
3901          bravalue = OP_BRA;          bravalue = OP_BRA;
3902          ptr++;          ptr++;
3903          break;          break;
3904    
3905    
3906            /* ------------------------------------------------------------ */
3907          case '(':          case '(':
3908          bravalue = OP_COND;       /* Conditional group */          bravalue = OP_COND;       /* Conditional group */
3909    
3910          /* A condition can be a number, referring to a numbered group, a name,          /* A condition can be an assertion, a number (referring to a numbered
3911          referring to a named group, 'R', referring to recursion, or an          group), a name (referring to a named group), or 'R', referring to
3912          assertion. There are two unfortunate ambiguities, caused by history.          recursion. R<digits> and R&name are also permitted for recursion tests.
3913          (a) 'R' can be the recursive thing or the name 'R', and (b) a number  
3914          could be a name that consists of digits. In both cases, we look for a          There are several syntaxes for testing a named group: (?(name)) is used
3915          name first; if not found, we try the other cases. If the first          by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
3916          character after (?( is a word character, we know the rest up to ) will  
3917          also be word characters because the syntax was checked in the first          There are two unfortunate ambiguities, caused by history. (a) 'R' can
3918          pass. */          be the recursive thing or the name 'R' (and similarly for 'R' followed
3919            by digits), and (b) a number could be a name that consists of digits.
3920          if ((cd->ctypes[ptr[1]] & ctype_word) != 0)          In both cases, we look for a name first; if not found, we try the other
3921            {          cases. */
3922            int i, namelen;  
3923            int condref = 0;          /* For conditions that are assertions, check the syntax, and then exit
3924            const uschar *name;          the switch. This will take control down to where bracketed groups,
3925            uschar *slot = cd->name_table;          including assertions, are processed. */
3926    
3927            /* This is needed for all successful cases. */          if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
3928              break;
3929    
3930            skipbytes = 3;          /* Most other conditions use OP_CREF (a couple change to OP_RREF
3931            below), and all need to skip 3 bytes at the start of the group. */
3932    
3933            /* Read the name, but also get it as a number if it's all digits */          code[1+LINK_SIZE] = OP_CREF;
3934            skipbytes = 3;
3935            refsign = -1;
3936    
3937            name = ++ptr;          /* Check for a test for recursion in a named group. */
           while (*ptr != ')')  
             {  
             if (condref >= 0)  
               condref = ((digitab[*ptr] & ctype_digit) != 0)?  
                 condref * 10 + *ptr - '0' : -1;  
             ptr++;  
             }  
           namelen = ptr - name;  
           ptr++;  
3938    
3939            for (i = 0; i < cd->names_found; i++)          if (ptr[1] == 'R' && ptr[2] == '&')
3940              {            {
3941              if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;            terminator = -1;
3942              slot += cd->name_entry_size;            ptr += 2;
3943              }            code[1+LINK_SIZE] = OP_RREF;    /* Change the type of test */
3944              }
3945    
3946            /* Found a previous named subpattern */          /* Check for a test for a named group's having been set, using the Perl
3947            syntax (?(<name>) or (?('name') */
3948    
3949            if (i < cd->names_found)          else if (ptr[1] == '<')
3950              {            {
3951              condref = GET2(slot, 0);            terminator = '>';
3952              code[1+LINK_SIZE] = OP_CREF;            ptr++;
3953              PUT2(code, 2+LINK_SIZE, condref);            }
3954              }          else if (ptr[1] == '\'')
3955              {
3956              terminator = '\'';
3957              ptr++;
3958              }
3959            else
3960              {
3961              terminator = 0;
3962              if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
3963              }
3964    
3965            /* Search the pattern for a forward reference */          /* We now expect to read a name; anything else is an error */
3966    
3967            else if ((i = find_named_parens(ptr, *brackets, name, namelen)) > 0)          if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
3968              {            {
3969              code[1+LINK_SIZE] = OP_CREF;            ptr += 1;  /* To get the right offset */
3970              PUT2(code, 2+LINK_SIZE, i);            *errorcodeptr = ERR28;
3971              }            goto FAILED;
3972              }
3973    
3974            /* Check for 'R' for recursion */          /* Read the name, but also get it as a number if it's all digits */
3975    
3976            else if (namelen == 1 && *name == 'R')          recno = 0;
3977              {          name = ++ptr;
3978              code[1+LINK_SIZE] = OP_CREF;          while ((cd->ctypes[*ptr] & ctype_word) != 0)
3979              PUT2(code, 2+LINK_SIZE, CREF_RECURSE);            {
3980              }            if (recno >= 0)
3981                recno = ((digitab[*ptr] & ctype_digit) != 0)?
3982                  recno * 10 + *ptr - '0' : -1;
3983              ptr++;
3984              }
3985            namelen = ptr - name;
3986    
3987            /* Check for a subpattern number */          if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
3988              {
3989              ptr--;      /* Error offset */
3990              *errorcodeptr = ERR26;
3991              goto FAILED;
3992              }
3993    
3994            else if (condref > 0)          /* Do no further checking in the pre-compile phase. */
3995    
3996            if (lengthptr != NULL) break;
3997    
3998            /* In the real compile we do the work of looking for the actual
3999            reference. If the string started with "+" or "-" we require the rest to
4000            be digits, in which case recno will be set. */
4001    
4002            if (refsign > 0)
4003              {
4004              if (recno <= 0)
4005              {              {
4006              code[1+LINK_SIZE] = OP_CREF;              *errorcodeptr = ERR58;
4007              PUT2(code, 2+LINK_SIZE, condref);              goto FAILED;
4008                }
4009              if (refsign == '-')
4010                {
4011                recno = cd->bracount - recno + 1;
4012                if (recno <= 0)
4013                  {
4014                  *errorcodeptr = ERR15;
4015                  goto FAILED;
4016                  }
4017              }              }
4018              else recno += cd->bracount;
4019              PUT2(code, 2+LINK_SIZE, recno);
4020              break;
4021              }
4022    
4023            /* Either an unidentified subpattern, or a reference to (?(0) */          /* Otherwise (did not start with "+" or "-"), start by looking for the
4024            name. */
4025    
4026            else          slot = cd->name_table;
4027            for (i = 0; i < cd->names_found; i++)
4028              {
4029              if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4030              slot += cd->name_entry_size;
4031              }
4032    
4033            /* Found a previous named subpattern */
4034    
4035            if (i < cd->names_found)
4036              {
4037              recno = GET2(slot, 0);
4038              PUT2(code, 2+LINK_SIZE, recno);
4039              }
4040    
4041            /* Search the pattern for a forward reference */
4042    
4043            else if ((i = find_parens(ptr, cd->bracount, name, namelen,
4044                            (options & PCRE_EXTENDED) != 0)) > 0)
4045              {
4046              PUT2(code, 2+LINK_SIZE, i);
4047              }
4048    
4049            /* If terminator == 0 it means that the name followed directly after
4050            the opening parenthesis [e.g. (?(abc)...] and in this case there are
4051            some further alternatives to try. For the cases where terminator != 0
4052            [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
4053            now checked all the possibilities, so give an error. */
4054    
4055            else if (terminator != 0)
4056              {
4057              *errorcodeptr = ERR15;
4058              goto FAILED;
4059              }
4060    
4061            /* Check for (?(R) for recursion. Allow digits after R to specify a
4062            specific group number. */
4063    
4064            else if (*name == 'R')
4065              {
4066              recno = 0;
4067              for (i = 1; i < namelen; i++)
4068              {              {
4069              *errorcodeptr = (condref == 0)? ERR35: ERR15;              if ((digitab[name[i]] & ctype_digit) == 0)
4070              goto FAILED;                {
4071                  *errorcodeptr = ERR15;
4072                  goto FAILED;
4073                  }
4074                recno = recno * 10 + name[i] - '0';
4075              }              }
4076              if (recno == 0) recno = RREF_ANY;
4077              code[1+LINK_SIZE] = OP_RREF;      /* Change test type */
4078              PUT2(code, 2+LINK_SIZE, recno);
4079              }
4080    
4081            /* Similarly, check for the (?(DEFINE) "condition", which is always
4082            false. */
4083    
4084            else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
4085              {
4086              code[1+LINK_SIZE] = OP_DEF;
4087              skipbytes = 1;
4088            }            }
4089    
4090          /* For conditions that are assertions, we just fall through, having          /* Check for the "name" actually being a subpattern number. */
         set bravalue above. */  
4091    
4092            else if (recno > 0)
4093              {
4094              PUT2(code, 2+LINK_SIZE, recno);
4095              }
4096    
4097            /* Either an unidentified subpattern, or a reference to (?(0) */
4098    
4099            else
4100              {
4101              *errorcodeptr = (recno == 0)? ERR35: ERR15;
4102              goto FAILED;
4103              }
4104          break;          break;
4105    
4106    
4107            /* ------------------------------------------------------------ */
4108          case '=':                 /* Positive lookahead */          case '=':                 /* Positive lookahead */
4109          bravalue = OP_ASSERT;          bravalue = OP_ASSERT;
4110          ptr++;          ptr++;
4111          break;          break;
4112    
4113    
4114            /* ------------------------------------------------------------ */
4115          case '!':                 /* Negative lookahead */          case '!':                 /* Negative lookahead */
4116          bravalue = OP_ASSERT_NOT;          bravalue = OP_ASSERT_NOT;
4117          ptr++;          ptr++;
4118          break;          break;
4119    
4120          case '<':                 /* Lookbehinds */  
4121          switch (*(++ptr))          /* ------------------------------------------------------------ */
4122            case '<':                 /* Lookbehind or named define */
4123            switch (ptr[1])
4124            {            {
4125            case '=':               /* Positive lookbehind */            case '=':               /* Positive lookbehind */
4126            bravalue = OP_ASSERTBACK;            bravalue = OP_ASSERTBACK;
4127            ptr++;            ptr += 2;
4128            break;            break;
4129    
4130            case '!':               /* Negative lookbehind */            case '!':               /* Negative lookbehind */
4131            bravalue = OP_ASSERTBACK_NOT;            bravalue = OP_ASSERTBACK_NOT;
4132            ptr++;            ptr += 2;
4133            break;            break;
4134    
4135              default:                /* Could be name define, else bad */
4136              if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
4137              ptr++;                  /* Correct offset for error */
4138              *errorcodeptr = ERR24;
4139              goto FAILED;
4140            }            }
4141          break;          break;
4142    
4143    
4144            /* ------------------------------------------------------------ */
4145          case '>':                 /* One-time brackets */          case '>':                 /* One-time brackets */
4146          bravalue = OP_ONCE;          bravalue = OP_ONCE;
4147          ptr++;          ptr++;
4148          break;          break;
4149    
4150    
4151            /* ------------------------------------------------------------ */
4152          case 'C':                 /* Callout - may be followed by digits; */          case 'C':                 /* Callout - may be followed by digits; */
4153          previous_callout = code;  /* Save for later completion */          previous_callout = code;  /* Save for later completion */
4154          after_manual_callout = 1; /* Skip one item before completing */          after_manual_callout = 1; /* Skip one item before completing */
4155          *code++ = OP_CALLOUT;     /* Already checked that the terminating */          *code++ = OP_CALLOUT;
4156            {                       /* closing parenthesis is present. */            {
4157            int n = 0;            int n = 0;
4158            while ((digitab[*(++ptr)] & ctype_digit) != 0)            while ((digitab[*(++ptr)] & ctype_digit) != 0)
4159              n = n * 10 + *ptr - '0';              n = n * 10 + *ptr - '0';
4160              if (*ptr != ')')
4161                {
4162                *errorcodeptr = ERR39;
4163                goto FAILED;
4164                }
4165            if (n > 255)            if (n > 255)
4166              {              {
4167              *errorcodeptr = ERR38;              *errorcodeptr = ERR38;
# Line 3034  for (;; ptr++) Line 4175  for (;; ptr++)
4175          previous = NULL;          previous = NULL;
4176          continue;          continue;
4177    
4178          case 'P':                 /* Named subpattern handling */  
4179          if (*(++ptr) == '<')      /* Definition */          /* ------------------------------------------------------------ */
4180            case 'P':                 /* Python-style named subpattern handling */
4181            if (*(++ptr) == '=' || *ptr == '>')  /* Reference or recursion */
4182              {
4183              is_recurse = *ptr == '>';
4184              terminator = ')';
4185              goto NAMED_REF_OR_RECURSE;
4186              }
4187            else if (*ptr != '<')    /* Test for Python-style definition */
4188              {
4189              *errorcodeptr = ERR41;
4190              goto FAILED;
4191              }
4192            /* Fall through to handle (?P< as (?< is handled */
4193    
4194    
4195            /* ------------------------------------------------------------ */
4196            DEFINE_NAME:    /* Come here from (?< handling */
4197            case '\'':
4198            {            {
4199            int i, namelen;            terminator = (*ptr == '<')? '>' : '\'';
4200            uschar *slot = cd->name_table;            name = ++ptr;
4201            const uschar *name;     /* Don't amalgamate; some compilers */  
4202            name = ++ptr;           /* grumble at autoincrement in declaration */            while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4203              namelen = ptr - name;
4204    
4205            while (*ptr++ != '>');            /* In the pre-compile phase, just do a syntax check. */
           namelen = ptr - name - 1;  
4206    
4207            for (i = 0; i < cd->names_found; i++)            if (lengthptr != NULL)
4208                {
4209                if (*ptr != terminator)
4210                  {
4211                  *errorcodeptr = ERR42;
4212                  goto FAILED;
4213                  }
4214                if (cd->names_found >= MAX_NAME_COUNT)
4215                  {
4216                  *errorcodeptr = ERR49;
4217                  goto FAILED;
4218                  }
4219                if (namelen + 3 > cd->name_entry_size)
4220                  {
4221                  cd->name_entry_size = namelen + 3;
4222                  if (namelen > MAX_NAME_SIZE)
4223                    {
4224                    *errorcodeptr = ERR48;
4225                    goto FAILED;
4226                    }
4227                  }
4228                }
4229    
4230              /* In the real compile, create the entry in the table */
4231    
4232              else
4233              {              {
4234              int crc = memcmp(name, slot+2, namelen);              slot = cd->name_table;
4235              if (crc == 0)              for (i = 0; i < cd->names_found; i++)
4236                {                {
4237                if (slot[2+namelen] == 0)                int crc = memcmp(name, slot+2, namelen);
4238                  if (crc == 0)
4239                  {                  {
4240                  if ((options & PCRE_DUPNAMES) == 0)                  if (slot[2+namelen] == 0)
4241                    {                    {
4242                    *errorcodeptr = ERR43;                    if ((options & PCRE_DUPNAMES) == 0)
4243                    goto FAILED;                      {
4244                        *errorcodeptr = ERR43;
4245                        goto FAILED;
4246                        }
4247                    }                    }
4248                    else crc = -1;      /* Current name is substring */
4249                  }                  }
4250                else crc = -1;      /* Current name is substring */                if (crc < 0)
4251                }                  {
4252              if (crc < 0)                  memmove(slot + cd->name_entry_size, slot,
4253                {                    (cd->names_found - i) * cd->name_entry_size);
4254                memmove(slot + cd->name_entry_size, slot,                  break;
4255                  (cd->names_found - i) * cd->name_entry_size);                  }
4256                break;                slot += cd->name_entry_size;
4257                }                }
             slot += cd->name_entry_size;  
             }  
4258    
4259            PUT2(slot, 0, *brackets + 1);              PUT2(slot, 0, cd->bracount + 1);
4260            memcpy(slot + 2, name, namelen);              memcpy(slot + 2, name, namelen);
4261            slot[2+namelen] = 0;              slot[2+namelen] = 0;
4262            cd->names_found++;              }
           goto NUMBERED_GROUP;  
4263            }            }
4264    
4265          if (*ptr == '=' || *ptr == '>')  /* Reference or recursion */          /* In both cases, count the number of names we've encountered. */
4266    
4267            ptr++;                    /* Move past > or ' */
4268            cd->names_found++;
4269            goto NUMBERED_GROUP;
4270    
4271    
4272            /* ------------------------------------------------------------ */
4273            case '&':                 /* Perl recursion/subroutine syntax */
4274            terminator = ')';
4275            is_recurse = TRUE;
4276            /* Fall through */
4277    
4278            /* We come here from the Python syntax above that handles both
4279            references (?P=name) and recursion (?P>name), as well as falling
4280            through from the Perl recursion syntax (?&name). */
4281    
4282            NAMED_REF_OR_RECURSE:
4283            name = ++ptr;
4284            while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4285            namelen = ptr - name;
4286    
4287            /* In the pre-compile phase, do a syntax check and set a dummy
4288            reference number. */
4289    
4290            if (lengthptr != NULL)
4291            {            {
4292            int i, namelen;            if (*ptr != terminator)
4293            int type = *ptr++;              {
4294            const uschar *name = ptr;              *errorcodeptr = ERR42;
4295            uschar *slot = cd->name_table;              goto FAILED;
4296                }
4297              if (namelen > MAX_NAME_SIZE)
4298                {
4299                *errorcodeptr = ERR48;
4300                goto FAILED;
4301                }
4302              recno = 0;
4303              }
4304    
4305            while (*ptr != ')') ptr++;          /* In the real compile, seek the name in the table */
           namelen = ptr - name;  
4306    
4307            else
4308              {
4309              slot = cd->name_table;
4310            for (i = 0; i < cd->names_found; i++)            for (i = 0; i < cd->names_found; i++)
4311              {              {
4312              if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;              if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
# Line 3097  for (;; ptr++) Line 4318  for (;; ptr++)
4318              recno = GET2(slot, 0);              recno = GET2(slot, 0);
4319              }              }
4320            else if ((recno =                /* Forward back reference */            else if ((recno =                /* Forward back reference */
4321                      find_named_parens(ptr, *brackets, name, namelen)) <= 0)                      find_parens(ptr, cd->bracount, name, namelen,
4322                          (options & PCRE_EXTENDED) != 0)) <= 0)
4323              {              {
4324              *errorcodeptr = ERR15;              *errorcodeptr = ERR15;
4325              goto FAILED;              goto FAILED;
4326              }              }
4327              }
4328    
4329            if (type == '>') goto HANDLE_RECURSION;  /* A few lines below */          /* In both phases, we can now go to the code that handles numerical
4330            recursion or backreferences. */
           /* Back reference */  
4331    
4332            previous = code;          if (is_recurse) goto HANDLE_RECURSION;
4333            *code++ = OP_REF;            else goto HANDLE_REFERENCE;
           PUT2INC(code, 0, recno);  
           cd->backref_map |= (recno < 32)? (1 << recno) : 1;  
           if (recno > cd->top_backref) cd->top_backref = recno;  
           continue;  
           }  
4334    
         /* Should never happen */  
         break;  
4335    
4336          case 'R':                 /* Pattern recursion */          /* ------------------------------------------------------------ */
4337            case 'R':                 /* Recursion */
4338          ptr++;                    /* Same as (?0)      */          ptr++;                    /* Same as (?0)      */
4339          /* Fall through */          /* Fall through */
4340    
         /* Recursion or "subroutine" call */  
4341    
4342          case '0': case '1': case '2': case '3': case '4':          /* ------------------------------------------------------------ */
4343          case '5': case '6': case '7': case '8': case '9':          case '-': case '+':
4344            case '0': case '1': case '2': case '3': case '4':   /* Recursion or */
4345            case '5': case '6': case '7': case '8': case '9':   /* subroutine */
4346            {            {
4347            const uschar *called;            const uschar *called;
4348    
4349              if ((refsign = *ptr) == '+') ptr++;
4350              else if (refsign == '-')
4351                {
4352                if ((digitab[ptr[1]] & ctype_digit) == 0)
4353                  goto OTHER_CHAR_AFTER_QUERY;
4354                ptr++;
4355                }
4356    
4357            recno = 0;            recno = 0;
4358            while((digitab[*ptr] & ctype_digit) != 0)            while((digitab[*ptr] & ctype_digit) != 0)
4359              recno = recno * 10 + *ptr++ - '0';              recno = recno * 10 + *ptr++ - '0';
4360    
4361              if (*ptr != ')')
4362                {
4363                *errorcodeptr = ERR29;
4364                goto FAILED;
4365                }
4366    
4367              if (refsign == '-')
4368                {
4369                if (recno == 0)
4370                  {
4371                  *errorcodeptr = ERR58;
4372                  goto FAILED;
4373                  }
4374                recno = cd->bracount - recno + 1;
4375                if (recno <= 0)
4376                  {
4377                  *errorcodeptr = ERR15;
4378                  goto FAILED;
4379                  }
4380                }
4381              else if (refsign == '+')
4382                {
4383                if (recno == 0)
4384                  {
4385                  *errorcodeptr = ERR58;
4386                  goto FAILED;
4387                  }
4388                recno += cd->bracount;
4389                }
4390    
4391            /* Come here from code above that handles a named recursion */            /* Come here from code above that handles a named recursion */
4392    
4393            HANDLE_RECURSION:            HANDLE_RECURSION:
4394    
4395            previous = code;            previous = code;
4396              called = cd->start_code;
4397    
4398            /* Find the bracket that is being referenced. Temporarily end the            /* When we are actually compiling, find the bracket that is being
4399            regex in case it doesn't exist. */            referenced. Temporarily end the regex in case it doesn't exist before
4400              this point. If we end up with a forward reference, first check that
4401              the bracket does occur later so we can give the error (and position)
4402              now. Then remember this forward reference in the workspace so it can
4403              be filled in at the end. */
4404    
4405            *code = OP_END;            if (lengthptr == NULL)
           called = (recno == 0)? cd->start_code :  
             find_bracket(cd->start_code, utf8, recno);  
           if (called == NULL)  
4406              {              {
4407              *errorcodeptr = ERR15;              *code = OP_END;
4408              goto FAILED;              if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
             }  
4409    
4410            /* If the subpattern is still open, this is a recursive call. We              /* Forward reference */
           check to see if this is a left recursion that could loop for ever,  
           and diagnose that case. */  
4411    
4412            if (GET(called, 1) == 0 && could_be_empty(called, code, bcptr, utf8))              if (called == NULL)
4413              {                {
4414              *errorcodeptr = ERR40;                if (find_parens(ptr, cd->bracount, NULL, recno,
4415              goto FAILED;                     (options & PCRE_EXTENDED) != 0) < 0)
4416                    {
4417                    *errorcodeptr = ERR15;
4418                    goto FAILED;
4419                    }
4420                  called = cd->start_code + recno;
4421                  PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
4422                  }
4423    
4424                /* If not a forward reference, and the subpattern is still open,
4425                this is a recursive call. We check to see if this is a left
4426                recursion that could loop for ever, and diagnose that case. */
4427    
4428                else if (GET(called, 1) == 0 &&
4429                         could_be_empty(called, code, bcptr, utf8))
4430                  {
4431                  *errorcodeptr = ERR40;
4432                  goto FAILED;
4433                  }
4434              }              }
4435    
4436            /* Insert the recursion/subroutine item, automatically wrapped inside            /* Insert the recursion/subroutine item, automatically wrapped inside
4437            "once" brackets. */            "once" brackets. Set up a "previous group" length so that a
4438              subsequent quantifier will work. */
4439    
4440            *code = OP_ONCE;            *code = OP_ONCE;
4441            PUT(code, 1, 2 + 2*LINK_SIZE);            PUT(code, 1, 2 + 2*LINK_SIZE);
# Line 3174  for (;; ptr++) Line 4448  for (;; ptr++)
4448            *code = OP_KET;            *code = OP_KET;
4449            PUT(code, 1, 2 + 2*LINK_SIZE);            PUT(code, 1, 2 + 2*LINK_SIZE);
4450            code += 1 + LINK_SIZE;            code += 1 + LINK_SIZE;
4451    
4452              length_prevgroup = 3 + 3*LINK_SIZE;
4453            }            }
4454    
4455            /* Can't determine a first byte now */
4456    
4457            if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4458          continue;          continue;
4459    
         /* Character after (? not specially recognized */  
4460    
4461          default:                  /* Option setting */          /* ------------------------------------------------------------ */
4462            default:              /* Other characters: check option setting */
4463            OTHER_CHAR_AFTER_QUERY:
4464          set = unset = 0;          set = unset = 0;
4465          optset = &set;          optset = &set;
4466    
# Line 3189  for (;; ptr++) Line 4470  for (;; ptr++)
4470              {              {
4471              case '-': optset = &unset; break;              case '-': optset = &unset; break;
4472    
4473                case 'J':    /* Record that it changed in the external options */
4474                *optset |= PCRE_DUPNAMES;
4475                cd->external_options |= PCRE_JCHANGED;
4476                break;
4477    
4478              case 'i': *optset |= PCRE_CASELESS; break;              case 'i': *optset |= PCRE_CASELESS; break;
             case 'J': *optset |= PCRE_DUPNAMES; break;  
4479              case 'm': *optset |= PCRE_MULTILINE; break;              case 'm': *optset |= PCRE_MULTILINE; break;
4480              case 's': *optset |= PCRE_DOTALL; break;              case 's': *optset |= PCRE_DOTALL; break;
4481              case 'x': *optset |= PCRE_EXTENDED; break;              case 'x': *optset |= PCRE_EXTENDED; break;
4482              case 'U': *optset |= PCRE_UNGREEDY; break;              case 'U': *optset |= PCRE_UNGREEDY; break;
4483              case 'X': *optset |= PCRE_EXTRA; break;              case 'X': *optset |= PCRE_EXTRA; break;
4484    
4485                default:  *errorcodeptr = ERR12;
4486                          ptr--;    /* Correct the offset */
4487                          goto FAILED;
4488              }              }
4489            }            }
4490    
# Line 3204  for (;; ptr++) Line 4493  for (;; ptr++)
4493          newoptions = (options | set) & (~unset);          newoptions = (options | set) & (~unset);
4494    
4495          /* If the options ended with ')' this is not the start of a nested          /* If the options ended with ')' this is not the start of a nested
4496          group with option changes, so the options change at this level. Compile          group with option changes, so the options change at this level. If this
4497          code to change the ims options if this setting actually changes any of          item is right at the start of the pattern, the options can be
4498          them. We also pass the new setting back so that it can be put at the          abstracted and made external in the pre-compile phase, and ignored in
4499          start of any following branches, and when this group ends (if we are in          the compile phase. This can be helpful when matching -- for instance in
4500          a group), a resetting item can be compiled.          caseless checking of required bytes.
4501    
4502          Note that if this item is right at the start of the pattern, the          If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
4503          options will have been abstracted and made global, so there will be no          definitely *not* at the start of the pattern because something has been
4504          change to compile. */          compiled. In the pre-compile phase, however, the code pointer can have
4505            that value after the start, because it gets reset as code is discarded
4506            during the pre-compile. However, this can happen only at top level - if
4507            we are within parentheses, the starting BRA will still be present. At
4508            any parenthesis level, the length value can be used to test if anything
4509            has been compiled at that level. Thus, a test for both these conditions
4510            is necessary to ensure we correctly detect the start of the pattern in
4511            both phases.
4512    
4513            If we are not at the pattern start, compile code to change the ims
4514            options if this setting actually changes any of them. We also pass the
4515            new setting back so that it can be put at the start of any following
4516            branches, and when this group ends (if we are in a group), a resetting
4517            item can be compiled. */
4518    
4519          if (*ptr == ')')          if (*ptr == ')')
4520            {            {
4521            if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))            if (code == cd->start_code + 1 + LINK_SIZE &&
4522                   (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
4523              {              {
4524              *code++ = OP_OPT;              cd->external_options = newoptions;
4525              *code++ = newoptions & PCRE_IMS;              options = newoptions;
4526              }              }
4527             else
4528                {
4529                if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
4530                  {
4531                  *code++ = OP_OPT;
4532                  *code++ = newoptions & PCRE_IMS;
4533                  }
4534    
4535            /* Change options at this level, and pass them back for use              /* Change options at this level, and pass them back for use
4536            in subsequent branches. Reset the greedy defaults and the case              in subsequent branches. Reset the greedy defaults and the case
4537            value for firstbyte and reqbyte. */              value for firstbyte and reqbyte. */
4538    
4539            *optionsptr = options = newoptions;              *optionsptr = options = newoptions;
4540            greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);              greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
4541            greedy_non_default = greedy_default ^ 1;              greedy_non_default = greedy_default ^ 1;
4542            req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;              req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
4543                }
4544    
4545            previous = NULL;       /* This item can't be repeated */            previous = NULL;       /* This item can't be repeated */
4546            continue;              /* It is complete */            continue;              /* It is complete */
# Line 3242  for (;; ptr++) Line 4553  for (;; ptr++)
4553    
4554          bravalue = OP_BRA;          bravalue = OP_BRA;
4555          ptr++;          ptr++;
4556          }          }     /* End of switch for character following (? */
4557        }        }       /* End of (? handling */
4558    
4559      /* If PCRE_NO_AUTO_CAPTURE is set, all unadorned brackets become      /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
4560      non-capturing and behave like (?:...) brackets */      all unadorned brackets become non-capturing and behave like (?:...)
4561        brackets. */
4562    
4563      else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)      else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
4564        {        {
4565        bravalue = OP_BRA;        bravalue = OP_BRA;
4566        }        }
4567    
4568      /* Else we have a referencing group; adjust the opcode. If the bracket      /* Else we have a capturing group. */
     number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and  
     arrange for the true number to follow later, in an OP_BRANUMBER item. */  
4569    
4570      else      else
4571        {        {
4572        NUMBERED_GROUP:        NUMBERED_GROUP:
4573        if (++(*brackets) > EXTRACT_BASIC_MAX)        cd->bracount += 1;
4574          {        PUT2(code, 1+LINK_SIZE, cd->bracount);
4575          bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;        skipbytes = 2;
         code[1+LINK_SIZE] = OP_BRANUMBER;  
         PUT2(code, 2+LINK_SIZE, *brackets);  
         skipbytes = 3;  
         }  
       else bravalue = OP_BRA + *brackets;  
4576        }        }
4577    
4578      /* Process nested bracketed re. Assertions may not be repeated, but other      /* Process nested bracketed regex. Assertions may not be repeated, but
4579      kinds can be. We copy code into a non-register variable in order to be able      other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
4580      to pass its address because some compilers complain otherwise. Pass in a      non-register variable in order to be able to pass its address because some
4581      new setting for the ims options if they have changed. */      compilers complain otherwise. Pass in a new setting for the ims options if
4582        they have changed. */
4583    
4584      previous = (bravalue >= OP_ONCE)? code : NULL;      previous = (bravalue >= OP_ONCE)? code : NULL;
4585      *code = bravalue;      *code = bravalue;
4586      tempcode = code;      tempcode = code;
4587      tempreqvary = cd->req_varyopt;     /* Save value before bracket */      tempreqvary = cd->req_varyopt;     /* Save value before bracket */
4588        length_prevgroup = 0;              /* Initialize for pre-compile phase */
4589    
4590      if (!compile_regex(      if (!compile_regex(
4591           newoptions,                   /* The complete new option state */           newoptions,                   /* The complete new option state */
4592           options & PCRE_IMS,           /* The previous ims option state */           options & PCRE_IMS,           /* The previous ims option state */
          brackets,                     /* Extracting bracket count */  
4593           &tempcode,                    /* Where to put code (updated) */           &tempcode,                    /* Where to put code (updated) */
4594           &ptr,                         /* Input pointer (updated) */           &ptr,                         /* Input pointer (updated) */
4595           errorcodeptr,                 /* Where to put an error message */           errorcodeptr,                 /* Where to put an error message */
4596           (bravalue == OP_ASSERTBACK ||           (bravalue == OP_ASSERTBACK ||
4597            bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */            bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
4598           skipbytes,                    /* Skip over OP_COND/OP_BRANUMBER */           reset_bracount,               /* True if (?| group */
4599             skipbytes,                    /* Skip over bracket number */
4600           &subfirstbyte,                /* For possible first char */           &subfirstbyte,                /* For possible first char */
4601           &subreqbyte,                  /* For possible last char */           &subreqbyte,                  /* For possible last char */
4602           bcptr,                        /* Current branch chain */           bcptr,                        /* Current branch chain */
4603           cd))                          /* Tables block */           cd,                           /* Tables block */
4604             (lengthptr == NULL)? NULL :   /* Actual compile phase */
4605               &length_prevgroup           /* Pre-compile phase */
4606             ))
4607        goto FAILED;        goto FAILED;
4608    
4609      /* At the end of compiling, code is still pointing to the start of the      /* At the end of compiling, code is still pointing to the start of the
# Line 3302  for (;; ptr++) Line 4612  for (;; ptr++)
4612      is on the bracket. */      is on the bracket. */
4613    
4614      /* If this is a conditional bracket, check that there are no more than      /* If this is a conditional bracket, check that there are no more than
4615      two branches in the group. */      two branches in the group, or just one if it's a DEFINE group. We do this
4616        in the real compile phase, not in the pre-pass, where the whole group may
4617        not be available. */
4618    
4619      else if (bravalue == OP_COND)      if (bravalue == OP_COND && lengthptr == NULL)
4620        {        {
4621        uschar *tc = code;        uschar *tc = code;
4622        int condcount = 0;        int condcount = 0;
# Line 3315  for (;; ptr++) Line 4627  for (;; ptr++)
4627           }           }
4628        while (*tc != OP_KET);        while (*tc != OP_KET);
4629    
4630        if (condcount > 2)        /* A DEFINE group is never obeyed inline (the "condition" is always
4631          false). It must have only one branch. */
4632    
4633          if (code[LINK_SIZE+1] == OP_DEF)
4634          {          {
4635          *errorcodeptr = ERR27;          if (condcount > 1)
4636          goto FAILED;            {
4637              *errorcodeptr = ERR54;
4638              goto FAILED;
4639              }
4640            bravalue = OP_DEF;   /* Just a flag to suppress char handling below */
4641            }
4642    
4643          /* A "normal" conditional group. If there is just one branch, we must not
4644          make use of its firstbyte or reqbyte, because this is equivalent to an
4645          empty second branch. */
4646    
4647          else
4648            {
4649            if (condcount > 2)
4650              {
4651              *errorcodeptr = ERR27;
4652              goto FAILED;
4653              }
4654            if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
4655          }          }
4656          }
4657    
4658        /* Error if hit end of pattern */
4659    
4660        if (*ptr != ')')
4661          {
4662          *errorcodeptr = ERR14;
4663          goto FAILED;
4664          }
4665    
4666        /* If there is just one branch, we must not make use of its firstbyte or      /* In the pre-compile phase, update the length by the length of the nested
4667        reqbyte, because this is equivalent to an empty second branch. */      group, less the brackets at either end. Then reduce the compiled code to
4668        just the brackets so that it doesn't use much memory if it is duplicated by
4669        a quantifier. */
4670    
4671        if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;      if (lengthptr != NULL)
4672          {
4673          if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
4674            {
4675            *errorcodeptr = ERR20;
4676            goto FAILED;
4677            }
4678          *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
4679          code++;
4680          PUTINC(code, 0, 1 + LINK_SIZE);
4681          *code++ = OP_KET;
4682          PUTINC(code, 0, 1 + LINK_SIZE);
4683        }        }
4684    
4685      /* Handle updating of the required and first characters. Update for normal      /* Otherwise update the main code pointer to the end of the group. */
4686      brackets of all kinds, and conditions with two branches (see code above).  
4687      If the bracket is followed by a quantifier with zero repeat, we have to      else code = tempcode;
4688      back off. Hence the definition of zeroreqbyte and zerofirstbyte outside the  
4689      main loop so that they can be accessed for the back off. */      /* For a DEFINE group, required and first character settings are not
4690        relevant. */
4691    
4692        if (bravalue == OP_DEF) break;
4693    
4694        /* Handle updating of the required and first characters for other types of
4695        group. Update for normal brackets of all kinds, and conditions with two
4696        branches (see code above). If the bracket is followed by a quantifier with
4697        zero repeat, we have to back off. Hence the definition of zeroreqbyte and
4698        zerofirstbyte outside the main loop so that they can be accessed for the
4699        back off. */
4700    
4701      zeroreqbyte = reqbyte;      zeroreqbyte = reqbyte;
4702      zerofirstbyte = firstbyte;      zerofirstbyte = firstbyte;
4703      groupsetfirstbyte = FALSE;      groupsetfirstbyte = FALSE;
4704    
4705      if (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_COND)      if (bravalue >= OP_ONCE)
4706        {        {
4707        /* If we have not yet set a firstbyte in this branch, take it from the        /* If we have not yet set a firstbyte in this branch, take it from the
4708        subpattern, remembering that it was set here so that a repeat of more        subpattern, remembering that it was set here so that a repeat of more
# Line 3378  for (;; ptr++) Line 4743  for (;; ptr++)
4743      firstbyte, looking for an asserted first char. */      firstbyte, looking for an asserted first char. */
4744    
4745      else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;      else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
4746        break;     /* End of processing '(' */
4747    
     /* Now update the main code pointer to the end of the group. */  
   
     code = tempcode;  
   
     /* Error if hit end of pattern */  
   
     if (*ptr != ')')  
       {  
       *errorcodeptr = ERR14;  
       goto FAILED;  
       }  
     break;  
   
     /* Check \ for being a real metacharacter; if not, fall through and handle  
     it as a data character at the start of a string. Escape items are checked  
     for validity in the pre-compiling pass. */  
   
     case '\\':  
     tempptr = ptr;  
     c = check_escape(&ptr, errorcodeptr, *brackets, options, FALSE);  
4748    
4749      /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values      /* ===================================================================*/
4750        /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
4751      are arranged to be the negation of the corresponding OP_values. For the      are arranged to be the negation of the corresponding OP_values. For the
4752      back references, the values are ESC_REF plus the reference number. Only      back references, the values are ESC_REF plus the reference number. Only
4753      back references and those types that consume a character may be repeated.      back references and those types that consume a character may be repeated.
4754      We can test for values between ESC_b and ESC_Z for the latter; this may      We can test for values between ESC_b and ESC_Z for the latter; this may
4755      have to change if any new ones are ever created. */      have to change if any new ones are ever created. */
4756    
4757        case '\\':
4758        tempptr = ptr;
4759        c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
4760        if (*errorcodeptr != 0) goto FAILED;
4761    
4762      if (c < 0)      if (c < 0)
4763        {        {
4764        if (-c == ESC_Q)            /* Handle start of quoted string */        if (-c == ESC_Q)            /* Handle start of quoted string */
# Line 3416  for (;; ptr++) Line 4768  for (;; ptr++)
4768          continue;          continue;
4769          }          }
4770    
4771          if (-c == ESC_E) continue;  /* Perl ignores an orphan \E */
4772    
4773        /* For metasequences that actually match a character, we disable the        /* For metasequences that actually match a character, we disable the
4774        setting of a first character if it hasn't already been set. */        setting of a first character if it hasn't already been set. */
4775    
# Line 3427  for (;; ptr++) Line 4781  for (;; ptr++)
4781        zerofirstbyte = firstbyte;        zerofirstbyte = firstbyte;
4782        zeroreqbyte = reqbyte;        zeroreqbyte = reqbyte;
4783    
4784        /* Back references are handled specially */        /* \k<name> or \k'name' is a back reference by name (Perl syntax).
4785          We also support \k{name} (.NET syntax) */
4786    
4787          if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\'' || ptr[1] == '{'))
4788            {
4789            is_recurse = FALSE;
4790            terminator = (*(++ptr) == '<')? '>' : (*ptr == '\'')? '\'' : '}';
4791            goto NAMED_REF_OR_RECURSE;
4792            }
4793    
4794          /* Back references are handled specially; must disable firstbyte if
4795          not set to cope with cases like (?=(\w+))\1: which would otherwise set
4796          ':' later. */
4797    
4798        if (-c >= ESC_REF)        if (-c >= ESC_REF)
4799          {          {
4800          int number = -c - ESC_REF;          recno = -c - ESC_REF;
4801    
4802            HANDLE_REFERENCE:    /* Come here from named backref handling */
4803            if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4804          previous = code;          previous = code;
4805          *code++ = OP_REF;          *code++ = OP_REF;
4806          PUT2INC(code, 0, number);          PUT2INC(code, 0, recno);
4807            cd->backref_map |= (recno < 32)? (1 << recno) : 1;
4808            if (recno > cd->top_backref) cd->top_backref = recno;
4809          }          }
4810    
4811        /* So are Unicode property matches, if supported. We know that get_ucp        /* So are Unicode property matches, if supported. */
       won't fail because it was tested in the pre-pass. */  
4812    
4813  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
4814        else if (-c == ESC_P || -c == ESC_p)        else if (-c == ESC_P || -c == ESC_p)
# Line 3446  for (;; ptr++) Line 4816  for (;; ptr++)
4816          BOOL negated;          BOOL negated;
4817          int pdata;          int pdata;
4818          int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);          int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
4819            if (ptype < 0) goto FAILED;
4820          previous = code;          previous = code;
4821          *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;          *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
4822          *code++ = ptype;          *code++ = ptype;
4823          *code++ = pdata;          *code++ = pdata;
4824          }          }
4825    #else
4826    
4827          /* If Unicode properties are not supported, \X, \P, and \p are not
4828          allowed. */
4829    
4830          else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
4831            {
4832            *errorcodeptr = ERR45;
4833            goto FAILED;
4834            }
4835  #endif  #endif
4836    
4837        /* For the rest, we can obtain the OP value by negating the escape        /* For the rest (including \X when Unicode properties are supported), we
4838        value */        can obtain the OP value by negating the escape value. */
4839    
4840        else        else
4841          {          {
# Line 3478  for (;; ptr++) Line 4859  for (;; ptr++)
4859       mcbuffer[0] = c;       mcbuffer[0] = c;
4860       mclength = 1;       mclength = 1;
4861       }       }
   
4862      goto ONE_CHAR;      goto ONE_CHAR;
4863    
4864    
4865        /* ===================================================================*/
4866      /* Handle a literal character. It is guaranteed not to be whitespace or #      /* Handle a literal character. It is guaranteed not to be whitespace or #
4867      when the extended flag is set. If we are in UTF-8 mode, it may be a      when the extended flag is set. If we are in UTF-8 mode, it may be a
4868      multi-byte literal character. */      multi-byte literal character. */
# Line 3491  for (;; ptr++) Line 4873  for (;; ptr++)
4873      mcbuffer[0] = c;      mcbuffer[0] = c;
4874    
4875  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
4876      if (utf8 && (c & 0xc0) == 0xc0)      if (utf8 && c >= 0xc0)
4877        {        {
4878        while ((ptr[1] & 0xc0) == 0x80)        while ((ptr[1] & 0xc0) == 0x80)
4879          mcbuffer[mclength++] = *(++ptr);          mcbuffer[mclength++] = *(++ptr);
# Line 3542  for (;; ptr++) Line 4924  for (;; ptr++)
4924      }      }
4925    }                   /* end of big loop */    }                   /* end of big loop */
4926    
4927    
4928  /* Control never reaches here by falling through, only by a goto for all the  /* Control never reaches here by falling through, only by a goto for all the
4929  error states. Pass back the position in the pattern so that it can be displayed  error states. Pass back the position in the pattern so that it can be displayed
4930  to the user for diagnosing the error. */  to the user for diagnosing the error. */
# Line 3558  return FALSE; Line 4941  return FALSE;
4941  *     Compile sequence of alternatives           *  *     Compile sequence of alternatives           *
4942  *************************************************/  *************************************************/
4943    
4944  /* On entry, ptr is pointing past the bracket character, but on return  /* On entry, ptr is pointing past the bracket character, but on return it
4945  it points to the closing bracket, or vertical bar, or end of string.  points to the closing bracket, or vertical bar, or end of string. The code
4946  The code variable is pointing at the byte into which the BRA operator has been  variable is pointing at the byte into which the BRA operator has been stored.
4947  stored. If the ims options are changed at the start (for a (?ims: group) or  If the ims options are changed at the start (for a (?ims: group) or during any
4948  during any branch, we need to insert an OP_OPT item at the start of every  branch, we need to insert an OP_OPT item at the start of every following branch
4949  following branch to ensure they get set correctly at run time, and also pass  to ensure they get set correctly at run time, and also pass the new options
4950  the new options into every subsequent branch compile.  into every subsequent branch compile.
4951    
4952    This function is used during the pre-compile phase when we are trying to find
4953    out the amount of memory needed, as well as during the real compile phase. The
4954    value of lengthptr distinguishes the two phases.
4955    
4956  Argument:  Arguments:
4957    options        option bits, including any changes for this subpattern    options        option bits, including any changes for this subpattern
4958    oldims         previous settings of ims option bits    oldims         previous settings of ims option bits
   brackets       -> int containing the number of extracting brackets used  
4959    codeptr        -> the address of the current code pointer    codeptr        -> the address of the current code pointer
4960    ptrptr         -> the address of the current pattern pointer    ptrptr         -> the address of the current pattern pointer
4961    errorcodeptr   -> pointer to error code variable    errorcodeptr   -> pointer to error code variable
4962    lookbehind     TRUE if this is a lookbehind assertion    lookbehind     TRUE if this is a lookbehind assertion
4963    skipbytes      skip this many bytes at start (for OP_COND, OP_BRANUMBER)    reset_bracount TRUE to reset the count for each branch
4964      skipbytes      skip this many bytes at start (for brackets and OP_COND)
4965    firstbyteptr   place to put the first required character, or a negative number    firstbyteptr   place to put the first required character, or a negative number