/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory | Revision Log | View Patch

revision 85 by nigel, Sat Feb 24 21:41:13 2007 UTC | revision 170 by ph10, Mon Jun 4 11:21:13 2007 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2005 University of Cambridge             Copyright (c) 1997-2007 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 42  POSSIBILITY OF SUCH DAMAGE. Line 42  POSSIBILITY OF SUCH DAMAGE.
42  supporting internal functions that are not used by other modules. */  supporting internal functions that are not used by other modules. */
43    
44    
45    #define NLBLOCK cd             /* Block containing newline information */
46    #define PSSTART start_pattern  /* Field containing processed string start */
47    #define PSEND   end_pattern    /* Field containing processed string end */
48    
49    
50  #include "pcre_internal.h"  #include "pcre_internal.h"
51    
52    
# Line 53  used by pcretest. DEBUG is not defined w Line 58  used by pcretest. DEBUG is not defined w
58  #endif  #endif
59    
60    
   
61  /*************************************************  /*************************************************
62  *      Code parameters and static tables         *  *      Code parameters and static tables         *
63  *************************************************/  *************************************************/
64    
65  /* Maximum number of items on the nested bracket stacks at compile time. This  /* This value specifies the size of stack workspace that is used during the
66  applies to the nesting of all kinds of parentheses. It does not limit  first pre-compile phase that determines how much memory is required. The regex
67  un-nested, non-capturing parentheses. This number can be made bigger if  is partly compiled into this space, but the compiled parts are discarded as
68  necessary - it is used to dimension one int and one unsigned char vector at  soon as they can be, so that hopefully there will never be an overrun. The code
69  compile time. */  does, however, check for an overrun. The largest amount I've seen used is 218,
70    so this number is very generous.
71    
72    The same workspace is used during the second, actual compile phase for
73    remembering forward references to groups so that they can be filled in at the
74    end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
75    is 4 there is plenty of room. */
76    
77  #define BRASTACK_SIZE 200  #define COMPILE_WORK_SIZE (4096)
78    
79    
80  /* Table for handling escaped characters in the range '0'-'z'. Positive returns  /* Table for handling escaped characters in the range '0'-'z'. Positive returns
# Line 72  are simple data values; negative values Line 82  are simple data values; negative values
82  on. Zero means further processing is needed (for things like \x), or the escape  on. Zero means further processing is needed (for things like \x), or the escape
83  is invalid. */  is invalid. */
84    
85  #if !EBCDIC   /* This is the "normal" table for ASCII systems */  #ifndef EBCDIC  /* This is the "normal" table for ASCII systems */
86  static const short int escapes[] = {  static const short int escapes[] = {
87       0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */       0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */
88       0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */       0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */
89     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */
90       0,      0,      0,      0,      0,      0,      0,      0,   /* H - O */       0,      0,      0, -ESC_K,      0,      0,      0,      0,   /* H - O */
91  -ESC_P, -ESC_Q,      0, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */  -ESC_P, -ESC_Q, -ESC_R, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */
92  -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */  -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */
93     '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */     '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */
94       0,      0,      0,      0,      0,      0,  ESC_n,      0,   /* h - o */       0,      0,      0, -ESC_k,      0,      0,  ESC_n,      0,   /* h - o */
95  -ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0,      0, -ESC_w,   /* p - w */  -ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0,      0, -ESC_w,   /* p - w */
96       0,      0, -ESC_z                                            /* x - z */       0,      0, -ESC_z                                            /* x - z */
97  };  };
98    
99  #else         /* This is the "abnormal" table for EBCDIC systems */  #else           /* This is the "abnormal" table for EBCDIC systems */
100  static const short int escapes[] = {  static const short int escapes[] = {
101  /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',  /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
102  /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,  /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
# Line 97  static const short int escapes[] = { Line 107  static const short int escapes[] = {
107  /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',  /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
108  /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,  /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
109  /*  88 */     0,     0,      0,     '{',      0,     0,      0,      0,  /*  88 */     0,     0,      0,     '{',      0,     0,      0,      0,
110  /*  90 */     0,     0,      0,     'l',      0, ESC_n,      0, -ESC_p,  /*  90 */     0,     0, -ESC_k,     'l',      0, ESC_n,      0, -ESC_p,
111  /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,  /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
112  /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,     0, -ESC_w,      0,  /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,     0, -ESC_w,      0,
113  /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,  /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
# Line 106  static const short int escapes[] = { Line 116  static const short int escapes[] = {
116  /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,  /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
117  /*  C8 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  C8 */     0,     0,      0,       0,      0,     0,      0,      0,
118  /*  D0 */   '}',     0,      0,       0,      0,     0,      0, -ESC_P,  /*  D0 */   '}',     0,      0,       0,      0,     0,      0, -ESC_P,
119  /*  D8 */-ESC_Q,     0,      0,       0,      0,     0,      0,      0,  /*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
120  /*  E0 */  '\\',     0, -ESC_S,       0,      0,     0, -ESC_W, -ESC_X,  /*  E0 */  '\\',     0, -ESC_S,       0,      0,     0, -ESC_W, -ESC_X,
121  /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,  /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
122  /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
# Line 116  static const short int escapes[] = { Line 126  static const short int escapes[] = {
126    
127    
128  /* Tables of names of POSIX character classes and their lengths. The list is  /* Tables of names of POSIX character classes and their lengths. The list is
129  terminated by a zero length entry. The first three must be alpha, upper, lower,  terminated by a zero length entry. The first three must be alpha, lower, upper,
130  as this is assumed for handling case independence. */  as this is assumed for handling case independence. */
131    
132  static const char *const posix_names[] = {  static const char *const posix_names[] = {
# Line 127  static const char *const posix_names[] = Line 137  static const char *const posix_names[] =
137  static const uschar posix_name_lengths[] = {  static const uschar posix_name_lengths[] = {
138    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
139    
140  /* Table of class bit maps for each POSIX class; up to three may be combined  /* Table of class bit maps for each POSIX class. Each class is formed from a
141  to form the class. The table for [:blank:] is dynamically modified to remove  base map, with an optional addition or removal of another map. Then, for some
142  the vertical space characters. */  classes, there is some additional tweaking: for [:blank:] the vertical space
143    characters are removed, and for [:alpha:] and [:alnum:] the underscore
144    character is removed. The triples in the table consist of the base map offset,
145    second map offset or -1 if no second map, and a non-negative value for map
146    addition or a negative value for map subtraction (if there are two maps). The
147    absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
148    remove vertical space characters, 2 => remove underscore. */
149    
150  static const int posix_class_maps[] = {  static const int posix_class_maps[] = {
151    cbit_lower, cbit_upper, -1,             /* alpha */    cbit_word,  cbit_digit, -2,             /* alpha */
152    cbit_lower, -1,         -1,             /* lower */    cbit_lower, -1,          0,             /* lower */
153    cbit_upper, -1,         -1,             /* upper */    cbit_upper, -1,          0,             /* upper */
154    cbit_digit, cbit_lower, cbit_upper,     /* alnum */    cbit_word,  -1,          2,             /* alnum - word without underscore */
155    cbit_print, cbit_cntrl, -1,             /* ascii */    cbit_print, cbit_cntrl,  0,             /* ascii */
156    cbit_space, -1,         -1,             /* blank - a GNU extension */    cbit_space, -1,          1,             /* blank - a GNU extension */
157    cbit_cntrl, -1,         -1,             /* cntrl */    cbit_cntrl, -1,          0,             /* cntrl */
158    cbit_digit, -1,         -1,             /* digit */    cbit_digit, -1,          0,             /* digit */
159    cbit_graph, -1,         -1,             /* graph */    cbit_graph, -1,          0,             /* graph */
160    cbit_print, -1,         -1,             /* print */    cbit_print, -1,          0,             /* print */
161    cbit_punct, -1,         -1,             /* punct */    cbit_punct, -1,          0,             /* punct */
162    cbit_space, -1,         -1,             /* space */    cbit_space, -1,          0,             /* space */
163    cbit_word,  -1,         -1,             /* word - a Perl extension */    cbit_word,  -1,          0,             /* word - a Perl extension */
164    cbit_xdigit,-1,         -1              /* xdigit */    cbit_xdigit,-1,          0              /* xdigit */
165  };  };
166    
167    
168    #define STRING(a)  # a
169    #define XSTRING(s) STRING(s)
170    
171  /* The texts of compile-time error messages. These are "char *" because they  /* The texts of compile-time error messages. These are "char *" because they
172  are passed to the outside world. */  are passed to the outside world. Do not ever re-use any error number, because
173    they are documented. Always add a new error instead. Messages marked DEAD below
174    are no longer used. */
175    
176  static const char *error_texts[] = {  static const char *error_texts[] = {
177    "no error",    "no error",
# Line 165  static const char *error_texts[] = { Line 186  static const char *error_texts[] = {
186    "range out of order in character class",    "range out of order in character class",
187    "nothing to repeat",    "nothing to repeat",
188    /* 10 */    /* 10 */
189    "operand of unlimited repeat could match the empty string",    "operand of unlimited repeat could match the empty string",  /** DEAD **/
190    "internal error: unexpected repeat",    "internal error: unexpected repeat",
191    "unrecognized character after (?",    "unrecognized character after (?",
192    "POSIX named classes are supported only within a class",    "POSIX named classes are supported only within a class",
# Line 175  static const char *error_texts[] = { Line 196  static const char *error_texts[] = {
196    "erroffset passed as NULL",    "erroffset passed as NULL",
197    "unknown option bit(s) set",    "unknown option bit(s) set",
198    "missing ) after comment",    "missing ) after comment",
199    "parentheses nested too deeply",    "parentheses nested too deeply",  /** DEAD **/
200    /* 20 */    /* 20 */
201    "regular expression too large",    "regular expression too large",
202    "failed to get memory",    "failed to get memory",
# Line 184  static const char *error_texts[] = { Line 205  static const char *error_texts[] = {
205    "unrecognized character after (?<",    "unrecognized character after (?<",
206    /* 25 */    /* 25 */
207    "lookbehind assertion is not fixed length",    "lookbehind assertion is not fixed length",
208    "malformed number after (?(",    "malformed number or name after (?(",
209    "conditional group contains more than two branches",    "conditional group contains more than two branches",
210    "assertion expected after (?(",    "assertion expected after (?(",
211    "(?R or (?digits must be followed by )",    "(?R or (?[+-]digits must be followed by )",
212    /* 30 */    /* 30 */
213    "unknown POSIX class name",    "unknown POSIX class name",
214    "POSIX collating elements are not supported",    "POSIX collating elements are not supported",
215    "this version of PCRE is not compiled with PCRE_UTF8 support",    "this version of PCRE is not compiled with PCRE_UTF8 support",
216    "spare error",    "spare error",  /** DEAD **/
217    "character value in \\x{...} sequence is too large",    "character value in \\x{...} sequence is too large",
218    /* 35 */    /* 35 */
219    "invalid condition (?(0)",    "invalid condition (?(0)",
# Line 203  static const char *error_texts[] = { Line 224  static const char *error_texts[] = {
224    /* 40 */    /* 40 */
225    "recursive call could loop indefinitely",    "recursive call could loop indefinitely",
226    "unrecognized character after (?P",    "unrecognized character after (?P",
227    "syntax error after (?P",    "syntax error in subpattern name (missing terminator)",
228    "two named groups have the same name",    "two named subpatterns have the same name",
229    "invalid UTF-8 string",    "invalid UTF-8 string",
230    /* 45 */    /* 45 */
231    "support for \\P, \\p, and \\X has not been compiled",    "support for \\P, \\p, and \\X has not been compiled",
232    "malformed \\P or \\p sequence",    "malformed \\P or \\p sequence",
233    "unknown property name after \\P or \\p"    "unknown property name after \\P or \\p",
234      "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)",
235      "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")",
236      /* 50 */
237      "repeated subpattern is too long",
238      "octal value is greater than \\377 (not in UTF-8 mode)",
239      "internal error: overran compiling workspace",
240      "internal error: previously-checked referenced subpattern not found",
241      "DEFINE group contains more than one branch",
242      /* 55 */
243      "repeating a DEFINE group is not allowed",
244      "inconsistent NEWLINE options",
245      "\\g is not followed by an (optionally braced) non-zero number",
246      "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number"
247  };  };
248    
249    
# Line 229  For convenience, we use the same bit def Line 263  For convenience, we use the same bit def
263    
264  Then we can use ctype_digit and ctype_xdigit in the code. */  Then we can use ctype_digit and ctype_xdigit in the code. */
265    
266  #if !EBCDIC    /* This is the "normal" case, for ASCII systems */  #ifndef EBCDIC  /* This is the "normal" case, for ASCII systems */
267  static const unsigned char digitab[] =  static const unsigned char digitab[] =
268    {    {
269    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
# Line 265  static const unsigned char digitab[] = Line 299  static const unsigned char digitab[] =
299    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
300    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
301    
302  #else          /* This is the "abnormal" case, for EBCDIC systems */  #else           /* This is the "abnormal" case, for EBCDIC systems */
303  static const unsigned char digitab[] =  static const unsigned char digitab[] =
304    {    {
305    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
# Line 279  static const unsigned char digitab[] = Line 313  static const unsigned char digitab[] =
313    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
314    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
315    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
316    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88-     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- 95    */
317    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
318    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
319    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
# Line 313  static const unsigned char ebcdic_charta Line 347  static const unsigned char ebcdic_charta
347    0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */    0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */
348    0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */    0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
349    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
350    0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88-  */    0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- 95 */
351    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
352    0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */    0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
353    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
# Line 340  static const unsigned char ebcdic_charta Line 374  static const unsigned char ebcdic_charta
374  /* Definition to allow mutual recursion */  /* Definition to allow mutual recursion */
375    
376  static BOOL  static BOOL
377    compile_regex(int, int, int *, uschar **, const uschar **, int *, BOOL, int,    compile_regex(int, int, uschar **, const uschar **, int *, BOOL, int, int *,
378      int *, int *, branch_chain *, compile_data *);      int *, branch_chain *, compile_data *, int *);
379    
380    
381    
# Line 351  static BOOL Line 385  static BOOL
385    
386  /* This function is called when a \ has been encountered. It either returns a  /* This function is called when a \ has been encountered. It either returns a
387  positive value for a simple escape such as \n, or a negative value which  positive value for a simple escape such as \n, or a negative value which
388  encodes one of the more complicated things such as \d. When UTF-8 is enabled,  encodes one of the more complicated things such as \d. A backreference to group
389  a positive value greater than 255 may be returned. On entry, ptr is pointing at  n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
390  the \. On exit, it is on the final character of the escape sequence.  UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
391    ptr is pointing at the \. On exit, it is on the final character of the escape
392    sequence.
393    
394  Arguments:  Arguments:
395    ptrptr         points to the pattern position pointer    ptrptr         points to the pattern position pointer
# Line 371  static int Line 407  static int
407  check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,  check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
408    int options, BOOL isclass)    int options, BOOL isclass)
409  {  {
410  const uschar *ptr = *ptrptr;  BOOL utf8 = (options & PCRE_UTF8) != 0;
411    const uschar *ptr = *ptrptr + 1;
412  int c, i;  int c, i;
413    
414    GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */
415    ptr--;                            /* Set pointer back to the last byte */
416    
417  /* If backslash is at the end of the pattern, it's an error. */  /* If backslash is at the end of the pattern, it's an error. */
418    
 c = *(++ptr);  
419  if (c == 0) *errorcodeptr = ERR1;  if (c == 0) *errorcodeptr = ERR1;
420    
421  /* Non-alphamerics are literals. For digits or letters, do an initial lookup in  /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
422  a table. A non-zero result is something that can be returned immediately.  a table. A non-zero result is something that can be returned immediately.
423  Otherwise further processing may be required. */  Otherwise further processing may be required. */
424    
425  #if !EBCDIC    /* ASCII coding */  #ifndef EBCDIC  /* ASCII coding */
426  else if (c < '0' || c > 'z') {}                           /* Not alphameric */  else if (c < '0' || c > 'z') {}                           /* Not alphameric */
427  else if ((i = escapes[c - '0']) != 0) c = i;  else if ((i = escapes[c - '0']) != 0) c = i;
428    
429  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
430  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphameric */  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphameric */
431  else if ((i = escapes[c - 0x48]) != 0)  c = i;  else if ((i = escapes[c - 0x48]) != 0)  c = i;
432  #endif  #endif
# Line 397  else if ((i = escapes[c - 0x48]) != 0) Line 436  else if ((i = escapes[c - 0x48]) != 0)
436  else  else
437    {    {
438    const uschar *oldptr;    const uschar *oldptr;
439      BOOL braced, negated;
440    
441    switch (c)    switch (c)
442      {      {
443      /* A number of Perl escapes are not handled by PCRE. We give an explicit      /* A number of Perl escapes are not handled by PCRE. We give an explicit
# Line 410  else Line 451  else
451      *errorcodeptr = ERR37;      *errorcodeptr = ERR37;
452      break;      break;
453    
454        /* \g must be followed by a number, either plain or braced. If positive, it
455        is an absolute backreference. If negative, it is a relative backreference.
456        This is a Perl 5.10 feature. */
457    
458        case 'g':
459        if (ptr[1] == '{')
460          {
461          braced = TRUE;
462          ptr++;
463          }
464        else braced = FALSE;
465    
466        if (ptr[1] == '-')
467          {
468          negated = TRUE;
469          ptr++;
470          }
471        else negated = FALSE;
472    
473        c = 0;
474        while ((digitab[ptr[1]] & ctype_digit) != 0)
475          c = c * 10 + *(++ptr) - '0';
476    
477        if (c == 0 || (braced && *(++ptr) != '}'))
478          {
479          *errorcodeptr = ERR57;
480          return 0;
481          }
482    
483        if (negated)
484          {
485          if (c > bracount)
486            {
487            *errorcodeptr = ERR15;
488            return 0;
489            }
490          c = bracount - (c - 1);
491          }
492    
493        c = -(ESC_REF + c);
494        break;
495    
496      /* The handling of escape sequences consisting of a string of digits      /* The handling of escape sequences consisting of a string of digits
497      starting with one that is not zero is not straightforward. By experiment,      starting with one that is not zero is not straightforward. By experiment,
498      the way Perl works seems to be as follows:      the way Perl works seems to be as follows:
# Line 451  else Line 534  else
534        }        }
535    
536      /* \0 always starts an octal number, but we may drop through to here with a      /* \0 always starts an octal number, but we may drop through to here with a
537      larger first octal digit. */      larger first octal digit. The original code used just to take the least
538        significant 8 bits of octal numbers (I think this is what early Perls used
539        to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
540        than 3 octal digits. */
541    
542      case '0':      case '0':
543      c -= '0';      c -= '0';
544      while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')      while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
545          c = c * 8 + *(++ptr) - '0';          c = c * 8 + *(++ptr) - '0';
546      c &= 255;     /* Take least significant 8 bits */      if (!utf8 && c > 255) *errorcodeptr = ERR51;
547      break;      break;
548    
549      /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number      /* \x is complicated. \x{ddd} is a character number which can be greater
550      which can be greater than 0xff, but only if the ddd are hex digits. */      than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
551        treated as a data character. */
552    
553      case 'x':      case 'x':
554  #ifdef SUPPORT_UTF8      if (ptr[1] == '{')
     if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)  
555        {        {
556        const uschar *pt = ptr + 2;        const uschar *pt = ptr + 2;
557        register int count = 0;        int count = 0;
558    
559        c = 0;        c = 0;
560        while ((digitab[*pt] & ctype_xdigit) != 0)        while ((digitab[*pt] & ctype_xdigit) != 0)
561          {          {
562          int cc = *pt++;          register int cc = *pt++;
563            if (c == 0 && cc == '0') continue;     /* Leading zeroes */
564          count++;          count++;
565  #if !EBCDIC    /* ASCII coding */  
566    #ifndef EBCDIC  /* ASCII coding */
567          if (cc >= 'a') cc -= 32;               /* Convert to upper case */          if (cc >= 'a') cc -= 32;               /* Convert to upper case */
568          c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));          c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
569  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
570          if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */          if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */
571          c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));          c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
572  #endif  #endif
573          }          }
574    
575        if (*pt == '}')        if (*pt == '}')
576          {          {
577          if (c < 0 || count > 8) *errorcodeptr = ERR34;          if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
578          ptr = pt;          ptr = pt;
579          break;          break;
580          }          }
581    
582        /* If the sequence of hex digits does not end with '}', then we don't        /* If the sequence of hex digits does not end with '}', then we don't
583        recognize this construct; fall through to the normal \x handling. */        recognize this construct; fall through to the normal \x handling. */
584        }        }
 #endif  
585    
586      /* Read just a single hex char */      /* Read just a single-byte hex-defined char */
587    
588      c = 0;      c = 0;
589      while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)      while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
590        {        {
591        int cc;                               /* Some compilers don't like ++ */        int cc;                               /* Some compilers don't like ++ */
592        cc = *(++ptr);                        /* in initializers */        cc = *(++ptr);                        /* in initializers */
593  #if !EBCDIC    /* ASCII coding */  #ifndef EBCDIC  /* ASCII coding */
594        if (cc >= 'a') cc -= 32;              /* Convert to upper case */        if (cc >= 'a') cc -= 32;              /* Convert to upper case */
595        c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));        c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
596  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
597        if (cc <= 'z') cc += 64;              /* Convert to upper case */        if (cc <= 'z') cc += 64;              /* Convert to upper case */
598        c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));        c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
599  #endif  #endif
600        }        }
601      break;      break;
602    
603      /* Other special escapes not starting with a digit are straightforward */      /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
604        This coding is ASCII-specific, but then the whole concept of \cx is
605        ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
606    
607      case 'c':      case 'c':
608      c = *(++ptr);      c = *(++ptr);
# Line 520  else Line 612  else
612        return 0;        return 0;
613        }        }
614    
615      /* A letter is upper-cased; then the 0x40 bit is flipped. This coding  #ifndef EBCDIC  /* ASCII coding */
     is ASCII-specific, but then the whole concept of \cx is ASCII-specific.  
     (However, an EBCDIC equivalent has now been added.) */  
   
 #if !EBCDIC    /* ASCII coding */  
616      if (c >= 'a' && c <= 'z') c -= 32;      if (c >= 'a' && c <= 'z') c -= 32;
617      c ^= 0x40;      c ^= 0x40;
618  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
619      if (c >= 'a' && c <= 'z') c += 64;      if (c >= 'a' && c <= 'z') c += 64;
620      c ^= 0xC0;      c ^= 0xC0;
621  #endif  #endif
# Line 569  escape sequence. Line 657  escape sequence.
657  Argument:  Argument:
658    ptrptr         points to the pattern position pointer    ptrptr         points to the pattern position pointer
659    negptr         points to a boolean that is set TRUE for negation else FALSE    negptr         points to a boolean that is set TRUE for negation else FALSE
660      dptr           points to an int that is set to the detailed property value
661    errorcodeptr   points to the error code variable    errorcodeptr   points to the error code variable
662    
663  Returns:     value from ucp_type_table, or -1 for an invalid type  Returns:         type value from ucp_type_table, or -1 for an invalid type
664  */  */
665    
666  static int  static int
667  get_ucp(const uschar **ptrptr, BOOL *negptr, int *errorcodeptr)  get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
668  {  {
669  int c, i, bot, top;  int c, i, bot, top;
670  const uschar *ptr = *ptrptr;  const uschar *ptr = *ptrptr;
671  char name[4];  char name[32];
672    
673  c = *(++ptr);  c = *(++ptr);
674  if (c == 0) goto ERROR_RETURN;  if (c == 0) goto ERROR_RETURN;
675    
676  *negptr = FALSE;  *negptr = FALSE;
677    
678  /* \P or \p can be followed by a one- or two-character name in {}, optionally  /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
679  preceded by ^ for negation. */  negation. */
680    
681  if (c == '{')  if (c == '{')
682    {    {
# Line 596  if (c == '{') Line 685  if (c == '{')
685      *negptr = TRUE;      *negptr = TRUE;
686      ptr++;      ptr++;
687      }      }
688    for (i = 0; i <= 2; i++)    for (i = 0; i < sizeof(name) - 1; i++)
689      {      {
690      c = *(++ptr);      c = *(++ptr);
691      if (c == 0) goto ERROR_RETURN;      if (c == 0) goto ERROR_RETURN;
692      if (c == '}') break;      if (c == '}') break;
693      name[i] = c;      name[i] = c;
694      }      }
695    if (c !='}')   /* Try to distinguish error cases */    if (c !='}') goto ERROR_RETURN;
     {  
     while (*(++ptr) != 0 && *ptr != '}');  
     if (*ptr == '}') goto UNKNOWN_RETURN; else goto ERROR_RETURN;  
     }  
696    name[i] = 0;    name[i] = 0;
697    }    }
698    
# Line 628  top = _pcre_utt_size; Line 713  top = _pcre_utt_size;
713    
714  while (bot < top)  while (bot < top)
715    {    {
716    i = (bot + top)/2;    i = (bot + top) >> 1;
717    c = strcmp(name, _pcre_utt[i].name);    c = strcmp(name, _pcre_utt[i].name);
718    if (c == 0) return _pcre_utt[i].value;    if (c == 0)
719        {
720        *dptr = _pcre_utt[i].value;
721        return _pcre_utt[i].type;
722        }
723    if (c > 0) bot = i + 1; else top = i;    if (c > 0) bot = i + 1; else top = i;
724    }    }
725    
 UNKNOWN_RETURN:  
726  *errorcodeptr = ERR47;  *errorcodeptr = ERR47;
727  *ptrptr = ptr;  *ptrptr = ptr;
728  return -1;  return -1;
# Line 750  return p; Line 838  return p;
838    
839    
840  /*************************************************  /*************************************************
841    *       Find forward referenced subpattern       *
842    *************************************************/
843    
844    /* This function scans along a pattern's text looking for capturing
845    subpatterns, and counting them. If it finds a named pattern that matches the
846    name it is given, it returns its number. Alternatively, if the name is NULL, it
847    returns when it reaches a given numbered subpattern. This is used for forward
848    references to subpatterns. We know that if (?P< is encountered, the name will
849    be terminated by '>' because that is checked in the first pass.
850    
851    Arguments:
852      ptr          current position in the pattern
853      count        current count of capturing parens so far encountered
854      name         name to seek, or NULL if seeking a numbered subpattern
855      lorn         name length, or subpattern number if name is NULL
856      xmode        TRUE if we are in /x mode
857    
858    Returns:       the number of the named or numbered subpattern, or -1 if not found
859    */
860    
861    static int
862    find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
863      BOOL xmode)
864    {
865    const uschar *thisname;
866    
867    for (; *ptr != 0; ptr++)
868      {
869      int term;
870    
871      /* Skip over backslashed characters and also entire \Q...\E */
872    
873      if (*ptr == '\\')
874        {
875        if (*(++ptr) == 0) return -1;
876        if (*ptr == 'Q') for (;;)
877          {
878          while (*(++ptr) != 0 && *ptr != '\\');
879          if (*ptr == 0) return -1;
880          if (*(++ptr) == 'E') break;
881          }
882        continue;
883        }
884    
885      /* Skip over character classes */
886    
887      if (*ptr == '[')
888        {
889        while (*(++ptr) != ']')
890          {
891          if (*ptr == '\\')
892            {
893            if (*(++ptr) == 0) return -1;
894            if (*ptr == 'Q') for (;;)
895              {
896              while (*(++ptr) != 0 && *ptr != '\\');
897              if (*ptr == 0) return -1;
898              if (*(++ptr) == 'E') break;
899              }
900            continue;
901            }
902          }
903        continue;
904        }
905    
906      /* Skip comments in /x mode */
907    
908      if (xmode && *ptr == '#')
909        {
910        while (*(++ptr) != 0 && *ptr != '\n');
911        if (*ptr == 0) return -1;
912        continue;
913        }
914    
915      /* An opening parens must now be a real metacharacter */
916    
917      if (*ptr != '(') continue;
918      if (ptr[1] != '?')
919        {
920        count++;
921        if (name == NULL && count == lorn) return count;
922        continue;
923        }
924    
925      ptr += 2;
926      if (*ptr == 'P') ptr++;                      /* Allow optional P */
927    
928      /* We have to disambiguate (?<! and (?<= from (?<name> */
929    
930      if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
931           *ptr != '\'')
932        continue;
933    
934      count++;
935    
936      if (name == NULL && count == lorn) return count;
937      term = *ptr++;
938      if (term == '<') term = '>';
939      thisname = ptr;
940      while (*ptr != term) ptr++;
941      if (name != NULL && lorn == ptr - thisname &&
942          strncmp((const char *)name, (const char *)thisname, lorn) == 0)
943        return count;
944      }
945    
946    return -1;
947    }
948    
949    
950    
951    /*************************************************
952  *      Find first significant op code            *  *      Find first significant op code            *
953  *************************************************/  *************************************************/
954    
# Line 798  for (;;) Line 997  for (;;)
997    
998      case OP_CALLOUT:      case OP_CALLOUT:
999      case OP_CREF:      case OP_CREF:
1000      case OP_BRANUMBER:      case OP_RREF:
1001        case OP_DEF:
1002      code += _pcre_OP_lengths[*code];      code += _pcre_OP_lengths[*code];
1003      break;      break;
1004    
# Line 843  for (;;) Line 1043  for (;;)
1043    {    {
1044    int d;    int d;
1045    register int op = *cc;    register int op = *cc;
   if (op >= OP_BRA) op = OP_BRA;  
1046    
1047    switch (op)    switch (op)
1048      {      {
1049        case OP_CBRA:
1050      case OP_BRA:      case OP_BRA:
1051      case OP_ONCE:      case OP_ONCE:
1052      case OP_COND:      case OP_COND:
1053      d = find_fixedlength(cc, options);      d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1054      if (d < 0) return d;      if (d < 0) return d;
1055      branchlength += d;      branchlength += d;
1056      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
# Line 885  for (;;) Line 1085  for (;;)
1085      /* Skip over things that don't match chars */      /* Skip over things that don't match chars */
1086    
1087      case OP_REVERSE:      case OP_REVERSE:
     case OP_BRANUMBER:  
1088      case OP_CREF:      case OP_CREF:
1089        case OP_RREF:
1090        case OP_DEF:
1091      case OP_OPT:      case OP_OPT:
1092      case OP_CALLOUT:      case OP_CALLOUT:
1093      case OP_SOD:      case OP_SOD:
# Line 904  for (;;) Line 1105  for (;;)
1105    
1106      case OP_CHAR:      case OP_CHAR:
1107      case OP_CHARNC:      case OP_CHARNC:
1108        case OP_NOT:
1109      branchlength++;      branchlength++;
1110      cc += 2;      cc += 2;
1111  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
# Line 937  for (;;) Line 1139  for (;;)
1139    
1140      case OP_PROP:      case OP_PROP:
1141      case OP_NOTPROP:      case OP_NOTPROP:
1142      cc++;      cc += 2;
1143      /* Fall through */      /* Fall through */
1144    
1145      case OP_NOT_DIGIT:      case OP_NOT_DIGIT:
# Line 1018  Returns: pointer to the opcode for Line 1220  Returns: pointer to the opcode for
1220  static const uschar *  static const uschar *
1221  find_bracket(const uschar *code, BOOL utf8, int number)  find_bracket(const uschar *code, BOOL utf8, int number)
1222  {  {
 #ifndef SUPPORT_UTF8  
 utf8 = utf8;               /* Stop pedantic compilers complaining */  
 #endif  
   
1223  for (;;)  for (;;)
1224    {    {
1225    register int c = *code;    register int c = *code;
1226    if (c == OP_END) return NULL;    if (c == OP_END) return NULL;
1227    else if (c > OP_BRA)  
1228      /* XCLASS is used for classes that cannot be represented just by a bit
1229      map. This includes negated single high-valued characters. The length in
1230      the table is zero; the actual length is stored in the compiled code. */
1231    
1232      if (c == OP_XCLASS) code += GET(code, 1);
1233    
1234      /* Handle capturing bracket */
1235    
1236      else if (c == OP_CBRA)
1237      {      {
1238      int n = c - OP_BRA;      int n = GET2(code, 1+LINK_SIZE);
     if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE);  
1239      if (n == number) return (uschar *)code;      if (n == number) return (uschar *)code;
1240      code += _pcre_OP_lengths[OP_BRA];      code += _pcre_OP_lengths[c];
1241      }      }
1242    
1243      /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1244      a multi-byte character. The length in the table is a minimum, so we have to
1245      arrange to skip the extra bytes. */
1246    
1247    else    else
1248      {      {
1249      code += _pcre_OP_lengths[c];      code += _pcre_OP_lengths[c];
   
1250  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
   
     /* In UTF-8 mode, opcodes that are followed by a character may be followed  
     by a multi-byte character. The length in the table is a minimum, so we have  
     to scan along to skip the extra bytes. All opcodes are less than 128, so we  
     can use relatively efficient code. */  
   
1251      if (utf8) switch(c)      if (utf8) switch(c)
1252        {        {
1253        case OP_CHAR:        case OP_CHAR:
# Line 1051  for (;;) Line 1255  for (;;)
1255        case OP_EXACT:        case OP_EXACT:
1256        case OP_UPTO:        case OP_UPTO:
1257        case OP_MINUPTO:        case OP_MINUPTO:
1258          case OP_POSUPTO:
1259        case OP_STAR:        case OP_STAR:
1260        case OP_MINSTAR:        case OP_MINSTAR:
1261          case OP_POSSTAR:
1262        case OP_PLUS:        case OP_PLUS:
1263        case OP_MINPLUS:        case OP_MINPLUS:
1264          case OP_POSPLUS:
1265        case OP_QUERY:        case OP_QUERY:
1266        case OP_MINQUERY:        case OP_MINQUERY:
1267        while ((*code & 0xc0) == 0x80) code++;        case OP_POSQUERY:
1268        break;        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
   
       /* XCLASS is used for classes that cannot be represented just by a bit  
       map. This includes negated single high-valued characters. The length in  
       the table is zero; the actual length is stored in the compiled code. */  
   
       case OP_XCLASS:  
       code += GET(code, 1) + 1;  
1269        break;        break;
1270        }        }
1271  #endif  #endif
# Line 1092  Returns: pointer to the opcode for Line 1292  Returns: pointer to the opcode for
1292  static const uschar *  static const uschar *
1293  find_recurse(const uschar *code, BOOL utf8)  find_recurse(const uschar *code, BOOL utf8)
1294  {  {
 #ifndef SUPPORT_UTF8  
 utf8 = utf8;               /* Stop pedantic compilers complaining */  
 #endif  
   
1295  for (;;)  for (;;)
1296    {    {
1297    register int c = *code;    register int c = *code;
1298    if (c == OP_END) return NULL;    if (c == OP_END) return NULL;
1299    else if (c == OP_RECURSE) return code;    if (c == OP_RECURSE) return code;
1300    else if (c > OP_BRA)  
1301      {    /* XCLASS is used for classes that cannot be represented just by a bit
1302      code += _pcre_OP_lengths[OP_BRA];    map. This includes negated single high-valued characters. The length in
1303      }    the table is zero; the actual length is stored in the compiled code. */
1304    
1305      if (c == OP_XCLASS) code += GET(code, 1);
1306    
1307      /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes
1308      that are followed by a character may be followed by a multi-byte character.
1309      The length in the table is a minimum, so we have to arrange to skip the extra
1310      bytes. */
1311    
1312    else    else
1313      {      {
1314      code += _pcre_OP_lengths[c];      code += _pcre_OP_lengths[c];
   
1315  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
   
     /* In UTF-8 mode, opcodes that are followed by a character may be followed  
     by a multi-byte character. The length in the table is a minimum, so we have  
     to scan along to skip the extra bytes. All opcodes are less than 128, so we  
     can use relatively efficient code. */  
   
1316      if (utf8) switch(c)      if (utf8) switch(c)
1317        {        {
1318        case OP_CHAR:        case OP_CHAR:
# Line 1123  for (;;) Line 1320  for (;;)
1320        case OP_EXACT:        case OP_EXACT:
1321        case OP_UPTO:        case OP_UPTO:
1322        case OP_MINUPTO:        case OP_MINUPTO:
1323          case OP_POSUPTO:
1324        case OP_STAR:        case OP_STAR:
1325        case OP_MINSTAR:        case OP_MINSTAR:
1326          case OP_POSSTAR:
1327        case OP_PLUS:        case OP_PLUS:
1328        case OP_MINPLUS:        case OP_MINPLUS:
1329          case OP_POSPLUS:
1330        case OP_QUERY:        case OP_QUERY:
1331        case OP_MINQUERY:        case OP_MINQUERY:
1332        while ((*code & 0xc0) == 0x80) code++;        case OP_POSQUERY:
1333        break;        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
   
       /* XCLASS is used for classes that cannot be represented just by a bit  
       map. This includes negated single high-valued characters. The length in  
       the table is zero; the actual length is stored in the compiled code. */  
   
       case OP_XCLASS:  
       code += GET(code, 1) + 1;  
1334        break;        break;
1335        }        }
1336  #endif  #endif
# Line 1152  for (;;) Line 1345  for (;;)
1345  *************************************************/  *************************************************/
1346    
1347  /* This function scans through a branch of a compiled pattern to see whether it  /* This function scans through a branch of a compiled pattern to see whether it
1348  can match the empty string or not. It is called only from could_be_empty()  can match the empty string or not. It is called from could_be_empty()
1349  below. Note that first_significant_code() skips over assertions. If we hit an  below and from compile_branch() when checking for an unlimited repeat of a
1350  unclosed bracket, we return "empty" - this means we've struck an inner bracket  group that can match nothing. Note that first_significant_code() skips over
1351  whose current branch will already have been scanned.  assertions. If we hit an unclosed bracket, we return "empty" - this means we've
1352    struck an inner bracket whose current branch will already have been scanned.
1353    
1354  Arguments:  Arguments:
1355    code        points to start of search    code        points to start of search
# Line 1169  static BOOL Line 1363  static BOOL
1363  could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)  could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1364  {  {
1365  register int c;  register int c;
1366  for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0, TRUE);  for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1367       code < endcode;       code < endcode;
1368       code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))       code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1369    {    {
1370    const uschar *ccode;    const uschar *ccode;
1371    
1372    c = *code;    c = *code;
1373    
1374      /* Groups with zero repeats can of course be empty; skip them. */
1375    
1376      if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1377        {
1378        do code += GET(code, 1); while (*code == OP_ALT);
1379        c = *code;
1380        continue;
1381        }
1382    
1383    if (c >= OP_BRA)    /* For other groups, scan the branches. */
1384    
1385      if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE)
1386      {      {
1387      BOOL empty_branch;      BOOL empty_branch;
1388      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
# Line 1193  for (code = first_significant_code(code Line 1398  for (code = first_significant_code(code
1398        }        }
1399      while (*code == OP_ALT);      while (*code == OP_ALT);
1400      if (!empty_branch) return FALSE;   /* All branches are non-empty */      if (!empty_branch) return FALSE;   /* All branches are non-empty */
1401      code += 1 + LINK_SIZE;      c = *code;
1402      c = *code;      continue;
1403      }      }
1404    
1405    else switch (c)    /* Handle the other opcodes */
1406    
1407      switch (c)
1408      {      {
1409      /* Check for quantifiers after a class */      /* Check for quantifiers after a class */
1410    
# Line 1253  for (code = first_significant_code(code Line 1460  for (code = first_significant_code(code
1460      case OP_NOT:      case OP_NOT:
1461      case OP_PLUS:      case OP_PLUS:
1462      case OP_MINPLUS:      case OP_MINPLUS:
1463        case OP_POSPLUS:
1464      case OP_EXACT:      case OP_EXACT:
1465      case OP_NOTPLUS:      case OP_NOTPLUS:
1466      case OP_NOTMINPLUS:      case OP_NOTMINPLUS:
1467        case OP_NOTPOSPLUS:
1468      case OP_NOTEXACT:      case OP_NOTEXACT:
1469      case OP_TYPEPLUS:      case OP_TYPEPLUS:
1470      case OP_TYPEMINPLUS:      case OP_TYPEMINPLUS:
1471        case OP_TYPEPOSPLUS:
1472      case OP_TYPEEXACT:      case OP_TYPEEXACT:
1473      return FALSE;      return FALSE;
1474    
# Line 1270  for (code = first_significant_code(code Line 1480  for (code = first_significant_code(code
1480      case OP_ALT:      case OP_ALT:
1481      return TRUE;      return TRUE;
1482    
1483      /* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO  may be      /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1484      followed by a multibyte character */      MINUPTO, and POSUPTO may be followed by a multibyte character */
1485    
1486  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1487      case OP_STAR:      case OP_STAR:
1488      case OP_MINSTAR:      case OP_MINSTAR:
1489        case OP_POSSTAR:
1490      case OP_QUERY:      case OP_QUERY:
1491      case OP_MINQUERY:      case OP_MINQUERY:
1492        case OP_POSQUERY:
1493      case OP_UPTO:      case OP_UPTO:
1494      case OP_MINUPTO:      case OP_MINUPTO:
1495        case OP_POSUPTO:
1496      if (utf8) while ((code[2] & 0xc0) == 0x80) code++;      if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1497      break;      break;
1498  #endif  #endif
# Line 1397  earlier groups that are outside the curr Line 1610  earlier groups that are outside the curr
1610  optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before  optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1611  it, after it has been compiled. This means that any OP_RECURSE items within it  it, after it has been compiled. This means that any OP_RECURSE items within it
1612  that refer to the group itself or any contained groups have to have their  that refer to the group itself or any contained groups have to have their
1613  offsets adjusted. That is the job of this function. Before it is called, the  offsets adjusted. That is one of the jobs of this function. Before it is called,
1614  partially compiled regex must be temporarily terminated with OP_END.  the partially compiled regex must be temporarily terminated with OP_END.
1615    
1616    This function has been extended with the possibility of forward references for
1617    recursions and subroutine calls. It must also check the list of such references
1618    for the group we are dealing with. If it finds that one of the recursions in
1619    the current group is on this list, it adjusts the offset in the list, not the
1620    value in the reference (which is a group number).
1621    
1622  Arguments:  Arguments:
1623    group      points to the start of the group    group      points to the start of the group
1624    adjust     the amount by which the group is to be moved    adjust     the amount by which the group is to be moved
1625    utf8       TRUE in UTF-8 mode    utf8       TRUE in UTF-8 mode
1626    cd         contains pointers to tables etc.    cd         contains pointers to tables etc.
1627      save_hwm   the hwm forward reference pointer at the start of the group
1628    
1629  Returns:     nothing  Returns:     nothing
1630  */  */
1631    
1632  static void  static void
1633  adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd)  adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1634      uschar *save_hwm)
1635  {  {
1636  uschar *ptr = group;  uschar *ptr = group;
1637  while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)  while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1638    {    {
1639    int offset = GET(ptr, 1);    int offset;
1640    if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);    uschar *hc;
1641    
1642      /* See if this recursion is on the forward reference list. If so, adjust the
1643      reference. */
1644    
1645      for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1646        {
1647        offset = GET(hc, 0);
1648        if (cd->start_code + offset == ptr + 1)
1649          {
1650          PUT(hc, 0, offset + adjust);
1651          break;
1652          }
1653        }
1654    
1655      /* Otherwise, adjust the recursion offset if it's after the start of this
1656      group. */
1657    
1658      if (hc >= cd->hwm)
1659        {
1660        offset = GET(ptr, 1);
1661        if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1662        }
1663    
1664    ptr += 1 + LINK_SIZE;    ptr += 1 + LINK_SIZE;
1665    }    }
1666  }  }
# Line 1495  Yield: TRUE when range returned; Line 1739  Yield: TRUE when range returned;
1739  */  */
1740    
1741  static BOOL  static BOOL
1742  get_othercase_range(int *cptr, int d, int *ocptr, int *odptr)  get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
1743      unsigned int *odptr)
1744  {  {
1745  int c, chartype, othercase, next;  unsigned int c, othercase, next;
1746    
1747  for (c = *cptr; c <= d; c++)  for (c = *cptr; c <= d; c++)
1748    {    { if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) break; }
   if (_pcre_ucp_findchar(c, &chartype, &othercase) == ucp_L && othercase != 0)  
     break;  
   }  
1749    
1750  if (c > d) return FALSE;  if (c > d) return FALSE;
1751    
# Line 1512  next = othercase + 1; Line 1754  next = othercase + 1;
1754    
1755  for (++c; c <= d; c++)  for (++c; c <= d; c++)
1756    {    {
1757    if (_pcre_ucp_findchar(c, &chartype, &othercase) != ucp_L ||    if (_pcre_ucp_othercase(c) != next) break;
         othercase != next)  
     break;  
1758    next++;    next++;
1759    }    }
1760    
# Line 1526  return TRUE; Line 1766  return TRUE;
1766  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
1767    
1768    
1769    
1770  /*************************************************  /*************************************************
1771  *           Compile one branch                   *  *     Check if auto-possessifying is possible    *
1772  *************************************************/  *************************************************/
1773    
1774  /* Scan the pattern, compiling it into the code vector. If the options are  /* This function is called for unlimited repeats of certain items, to see
1775  changed during the branch, the pointer is used to change the external options  whether the next thing could possibly match the repeated item. If not, it makes
1776  bits.  sense to automatically possessify the repeated item.
1777    
1778  Arguments:  Arguments:
1779    optionsptr     pointer to the option bits    op_code       the repeated op code
1780    brackets       points to number of extracting brackets used    this          data for this item, depends on the opcode
1781    codeptr        points to the pointer to the current code point    utf8          TRUE in UTF-8 mode
1782    ptrptr         points to the current pattern pointer    utf8_char     used for utf8 character bytes, NULL if not relevant
1783    errorcodeptr   points to error code variable    ptr           next character in pattern
1784    firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)    options       options bits
1785    reqbyteptr     set to the last literal character required, else < 0    cd            contains pointers to tables etc.
   bcptr          points to current branch chain  
   cd             contains pointers to tables etc.  
1786    
1787  Returns:         TRUE on success  Returns:        TRUE if possessifying is wanted
                  FALSE, with *errorcodeptr set non-zero on error  
1788  */  */
1789    
1790  static BOOL  static BOOL
1791  compile_branch(int *optionsptr, int *brackets, uschar **codeptr,  check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
1792    const uschar **ptrptr, int *errorcodeptr, int *firstbyteptr,    const uschar *ptr, int options, compile_data *cd)
   int *reqbyteptr, branch_chain *bcptr, compile_data *cd)  
1793  {  {
1794  int repeat_type, op_type;  int next;
1795  int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */  
1796  int bravalue = 0;  /* Skip whitespace and comments in extended mode */
1797  int greedy_default, greedy_non_default;  
1798  int firstbyte, reqbyte;  if ((options & PCRE_EXTENDED) != 0)
1799  int zeroreqbyte, zerofirstbyte;    {
1800  int req_caseopt, reqvary, tempreqvary;    for (;;)
1801  int condcount = 0;      {
1802  int options = *optionsptr;      while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1803  int after_manual_callout = 0;      if (*ptr == '#')
1804  register int c;        {
1805  register uschar *code = *codeptr;        while (*(++ptr) != 0)
1806  uschar *tempcode;          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1807  BOOL inescq = FALSE;        }
1808  BOOL groupsetfirstbyte = FALSE;      else break;
1809  const uschar *ptr = *ptrptr;      }
1810  const uschar *tempptr;    }
1811  uschar *previous = NULL;  
1812  uschar *previous_callout = NULL;  /* If the next item is one that we can handle, get its value. A non-negative
1813  uschar classbits[32];  value is a character, a negative value is an escape value. */
1814    
1815    if (*ptr == '\\')
1816      {
1817      int temperrorcode = 0;
1818      next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
1819      if (temperrorcode != 0) return FALSE;
1820      ptr++;    /* Point after the escape sequence */
1821      }
1822    
1823    else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
1824      {
1825  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1826  BOOL class_utf8;    if (utf8) { GETCHARINC(next, ptr); } else
 BOOL utf8 = (options & PCRE_UTF8) != 0;  
 uschar *class_utf8data;  
 uschar utf8_char[6];  
 #else  
 BOOL utf8 = FALSE;  
1827  #endif  #endif
1828      next = *ptr++;
1829      }
1830    
1831  /* Set up the default and non-default settings for greediness */  else return FALSE;
1832    
1833  greedy_default = ((options & PCRE_UNGREEDY) != 0);  /* Skip whitespace and comments in extended mode */
 greedy_non_default = greedy_default ^ 1;  
1834    
1835  /* Initialize no first byte, no required byte. REQ_UNSET means "no char  if ((options & PCRE_EXTENDED) != 0)
1836  matching encountered yet". It gets changed to REQ_NONE if we hit something that    {
1837  matches a non-fixed char first char; reqbyte just remains unset if we never    for (;;)
1838  find one.      {
1839        while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1840        if (*ptr == '#')
1841          {
1842          while (*(++ptr) != 0)
1843            if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1844          }
1845        else break;
1846        }
1847      }
1848    
1849  When we hit a repeat whose minimum is zero, we may have to adjust these values  /* If the next thing is itself optional, we have to give up. */
 to take the zero repeat into account. This is implemented by setting them to  
 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual  
 item types that can be repeated set these backoff variables appropriately. */  
1850    
1851  firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;  if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
1852      return FALSE;
1853    
1854  /* The variable req_caseopt contains either the REQ_CASELESS value or zero,  /* Now compare the next item with the previous opcode. If the previous is a
1855  according to the current setting of the caseless flag. REQ_CASELESS is a bit  positive single character match, "item" either contains the character or, if
1856  value > 255. It is added into the firstbyte or reqbyte variables to record the  "item" is greater than 127 in utf8 mode, the character's bytes are in
1857  case status of the value. This is used only for ASCII characters. */  utf8_char. */
1858    
 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;  
1859    
1860  /* Switch on next character until the end of the branch */  /* Handle cases when the next item is a character. */
1861    
1862  for (;; ptr++)  if (next >= 0) switch(op_code)
1863    {    {
1864    BOOL negate_class;    case OP_CHAR:
1865    BOOL possessive_quantifier;  #ifdef SUPPORT_UTF8
1866    BOOL is_quantifier;    if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1867    int class_charcount;  #endif
1868    int class_lastchar;    return item != next;
   int newoptions;  
   int recno;  
   int skipbytes;  
   int subreqbyte;  
   int subfirstbyte;  
   int mclength;  
   uschar mcbuffer[8];  
   
   /* Next byte in the pattern */  
   
   c = *ptr;  
1869    
1870    /* If in \Q...\E, check for the end; if not, we have a literal */    /* For CHARNC (caseless character) we must check the other case. If we have
1871      Unicode property support, we can use it to test the other case of
1872      high-valued characters. */
1873    
1874    if (inescq && c != 0)    case OP_CHARNC:
1875    #ifdef SUPPORT_UTF8
1876      if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1877    #endif
1878      if (item == next) return FALSE;
1879    #ifdef SUPPORT_UTF8
1880      if (utf8)
1881      {      {
1882      if (c == '\\' && ptr[1] == 'E')      unsigned int othercase;
1883        {      if (next < 128) othercase = cd->fcc[next]; else
1884    #ifdef SUPPORT_UCP
1885        othercase = _pcre_ucp_othercase((unsigned int)next);
1886    #else
1887        othercase = NOTACHAR;
1888    #endif
1889        return (unsigned int)item != othercase;
1890        }
1891      else
1892    #endif  /* SUPPORT_UTF8 */
1893      return (item != cd->fcc[next]);  /* Non-UTF-8 mode */
1894    
1895      /* For OP_NOT, "item" must be a single-byte character. */
1896    
1897      case OP_NOT:
1898      if (next < 0) return FALSE;  /* Not a character */
1899      if (item == next) return TRUE;
1900      if ((options & PCRE_CASELESS) == 0) return FALSE;
1901    #ifdef SUPPORT_UTF8
1902      if (utf8)
1903        {
1904        unsigned int othercase;
1905        if (next < 128) othercase = cd->fcc[next]; else
1906    #ifdef SUPPORT_UCP
1907        othercase = _pcre_ucp_othercase(next);
1908    #else
1909        othercase = NOTACHAR;
1910    #endif
1911        return (unsigned int)item == othercase;
1912        }
1913      else
1914    #endif  /* SUPPORT_UTF8 */
1915      return (item == cd->fcc[next]);  /* Non-UTF-8 mode */
1916    
1917      case OP_DIGIT:
1918      return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
1919    
1920      case OP_NOT_DIGIT:
1921      return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
1922    
1923      case OP_WHITESPACE:
1924      return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
1925    
1926      case OP_NOT_WHITESPACE:
1927      return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
1928    
1929      case OP_WORDCHAR:
1930      return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
1931    
1932      case OP_NOT_WORDCHAR:
1933      return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
1934    
1935      default:
1936      return FALSE;
1937      }
1938    
1939    
1940    /* Handle the case when the next item is \d, \s, etc. */
1941    
1942    switch(op_code)
1943      {
1944      case OP_CHAR:
1945      case OP_CHARNC:
1946    #ifdef SUPPORT_UTF8
1947      if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1948    #endif
1949      switch(-next)
1950        {
1951        case ESC_d:
1952        return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
1953    
1954        case ESC_D:
1955        return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
1956    
1957        case ESC_s:
1958        return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
1959    
1960        case ESC_S:
1961        return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
1962    
1963        case ESC_w:
1964        return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
1965    
1966        case ESC_W:
1967        return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
1968    
1969        default:
1970        return FALSE;
1971        }
1972    
1973      case OP_DIGIT:
1974      return next == -ESC_D || next == -ESC_s || next == -ESC_W;
1975    
1976      case OP_NOT_DIGIT:
1977      return next == -ESC_d;
1978    
1979      case OP_WHITESPACE:
1980      return next == -ESC_S || next == -ESC_d || next == -ESC_w;
1981    
1982      case OP_NOT_WHITESPACE:
1983      return next == -ESC_s;
1984    
1985      case OP_WORDCHAR:
1986      return next == -ESC_W || next == -ESC_s;
1987    
1988      case OP_NOT_WORDCHAR:
1989      return next == -ESC_w || next == -ESC_d;
1990    
1991      default:
1992      return FALSE;
1993      }
1994    
1995    /* Control does not reach here */
1996    }
1997    
1998    
1999    
2000    /*************************************************
2001    *           Compile one branch                   *
2002    *************************************************/
2003    
2004    /* Scan the pattern, compiling it into a vector. If the options are
2005    changed during the branch, the pointer is used to change the external options
2006    bits. This function is used during the pre-compile phase when we are trying
2007    to find out the amount of memory needed, as well as during the real compile
2008    phase. The value of lengthptr distinguishes the two phases.
2009    
2010    Arguments:
2011      optionsptr     pointer to the option bits
2012      codeptr        points to the pointer to the current code point
2013      ptrptr         points to the current pattern pointer
2014      errorcodeptr   points to error code variable
2015      firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2016      reqbyteptr     set to the last literal character required, else < 0
2017      bcptr          points to current branch chain
2018      cd             contains pointers to tables etc.
2019      lengthptr      NULL during the real compile phase
2020                     points to length accumulator during pre-compile phase
2021    
2022    Returns:         TRUE on success
2023                     FALSE, with *errorcodeptr set non-zero on error
2024    */
2025    
2026    static BOOL
2027    compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2028      int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2029      compile_data *cd, int *lengthptr)
2030    {
2031    int repeat_type, op_type;
2032    int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
2033    int bravalue = 0;
2034    int greedy_default, greedy_non_default;
2035    int firstbyte, reqbyte;
2036    int zeroreqbyte, zerofirstbyte;
2037    int req_caseopt, reqvary, tempreqvary;
2038    int options = *optionsptr;
2039    int after_manual_callout = 0;
2040    int length_prevgroup = 0;
2041    register int c;
2042    register uschar *code = *codeptr;
2043    uschar *last_code = code;
2044    uschar *orig_code = code;
2045    uschar *tempcode;
2046    BOOL inescq = FALSE;
2047    BOOL groupsetfirstbyte = FALSE;
2048    const uschar *ptr = *ptrptr;
2049    const uschar *tempptr;
2050    uschar *previous = NULL;
2051    uschar *previous_callout = NULL;
2052    uschar *save_hwm = NULL;
2053    uschar classbits[32];
2054    
2055    #ifdef SUPPORT_UTF8
2056    BOOL class_utf8;
2057    BOOL utf8 = (options & PCRE_UTF8) != 0;
2058    uschar *class_utf8data;
2059    uschar utf8_char[6];
2060    #else
2061    BOOL utf8 = FALSE;
2062    uschar *utf8_char = NULL;
2063    #endif
2064    
2065    #ifdef DEBUG
2066    if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2067    #endif
2068    
2069    /* Set up the default and non-default settings for greediness */
2070    
2071    greedy_default = ((options & PCRE_UNGREEDY) != 0);
2072    greedy_non_default = greedy_default ^ 1;
2073    
2074    /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2075    matching encountered yet". It gets changed to REQ_NONE if we hit something that
2076    matches a non-fixed char first char; reqbyte just remains unset if we never
2077    find one.
2078    
2079    When we hit a repeat whose minimum is zero, we may have to adjust these values
2080    to take the zero repeat into account. This is implemented by setting them to
2081    zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2082    item types that can be repeated set these backoff variables appropriately. */
2083    
2084    firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2085    
2086    /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2087    according to the current setting of the caseless flag. REQ_CASELESS is a bit
2088    value > 255. It is added into the firstbyte or reqbyte variables to record the
2089    case status of the value. This is used only for ASCII characters. */
2090    
2091    req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2092    
2093    /* Switch on next character until the end of the branch */
2094    
2095    for (;; ptr++)
2096      {
2097      BOOL negate_class;
2098      BOOL possessive_quantifier;
2099      BOOL is_quantifier;
2100      BOOL is_recurse;
2101      int class_charcount;
2102      int class_lastchar;
2103      int newoptions;
2104      int recno;
2105      int refsign;
2106      int skipbytes;
2107      int subreqbyte;
2108      int subfirstbyte;
2109      int terminator;
2110      int mclength;
2111      uschar mcbuffer[8];
2112    
2113      /* Get next byte in the pattern */
2114    
2115      c = *ptr;
2116    
2117      /* If we are in the pre-compile phase, accumulate the length used for the
2118      previous cycle of this loop. */
2119    
2120      if (lengthptr != NULL)
2121        {
2122    #ifdef DEBUG
2123        if (code > cd->hwm) cd->hwm = code;                 /* High water info */
2124    #endif
2125        if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2126          {
2127          *errorcodeptr = ERR52;
2128          goto FAILED;
2129          }
2130    
2131        /* There is at least one situation where code goes backwards: this is the
2132        case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2133        the class is simply eliminated. However, it is created first, so we have to
2134        allow memory for it. Therefore, don't ever reduce the length at this point.
2135        */
2136    
2137        if (code < last_code) code = last_code;
2138        *lengthptr += code - last_code;
2139        DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2140    
2141        /* If "previous" is set and it is not at the start of the work space, move
2142        it back to there, in order to avoid filling up the work space. Otherwise,
2143        if "previous" is NULL, reset the current code pointer to the start. */
2144    
2145        if (previous != NULL)
2146          {
2147          if (previous > orig_code)
2148            {
2149            memmove(orig_code, previous, code - previous);
2150            code -= previous - orig_code;
2151            previous = orig_code;
2152            }
2153          }
2154        else code = orig_code;
2155    
2156        /* Remember where this code item starts so we can pick up the length
2157        next time round. */
2158    
2159        last_code = code;
2160        }
2161    
2162      /* In the real compile phase, just check the workspace used by the forward
2163      reference list. */
2164    
2165      else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2166        {
2167        *errorcodeptr = ERR52;
2168        goto FAILED;
2169        }
2170    
2171      /* If in \Q...\E, check for the end; if not, we have a literal */
2172    
2173      if (inescq && c != 0)
2174        {
2175        if (c == '\\' && ptr[1] == 'E')
2176          {
2177        inescq = FALSE;        inescq = FALSE;
2178        ptr++;        ptr++;
2179        continue;        continue;
# Line 1643  for (;; ptr++) Line 2182  for (;; ptr++)
2182        {        {
2183        if (previous_callout != NULL)        if (previous_callout != NULL)
2184          {          {
2185          complete_callout(previous_callout, ptr, cd);          if (lengthptr == NULL)  /* Don't attempt in pre-compile phase */
2186              complete_callout(previous_callout, ptr, cd);
2187          previous_callout = NULL;          previous_callout = NULL;
2188          }          }
2189        if ((options & PCRE_AUTO_CALLOUT) != 0)        if ((options & PCRE_AUTO_CALLOUT) != 0)
# Line 1664  for (;; ptr++) Line 2204  for (;; ptr++)
2204    if (!is_quantifier && previous_callout != NULL &&    if (!is_quantifier && previous_callout != NULL &&
2205         after_manual_callout-- <= 0)         after_manual_callout-- <= 0)
2206      {      {
2207      complete_callout(previous_callout, ptr, cd);      if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
2208          complete_callout(previous_callout, ptr, cd);
2209      previous_callout = NULL;      previous_callout = NULL;
2210      }      }
2211    
# Line 1675  for (;; ptr++) Line 2216  for (;; ptr++)
2216      if ((cd->ctypes[c] & ctype_space) != 0) continue;      if ((cd->ctypes[c] & ctype_space) != 0) continue;
2217      if (c == '#')      if (c == '#')
2218        {        {
2219        /* The space before the ; is to avoid a warning on a silly compiler        while (*(++ptr) != 0)
2220        on the Macintosh. */          {
2221        while ((c = *(++ptr)) != 0 && c != NEWLINE) ;          if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2222        if (c != 0) continue;   /* Else fall through to handle end of string */          }
2223          if (*ptr != 0) continue;
2224    
2225          /* Else fall through to handle end of string */
2226          c = 0;
2227        }        }
2228      }      }
2229    
# Line 1692  for (;; ptr++) Line 2237  for (;; ptr++)
2237    
2238    switch(c)    switch(c)
2239      {      {
2240      /* The branch terminates at end of string, |, or ). */      /* ===================================================================*/
2241        case 0:                        /* The branch terminates at string end */
2242      case 0:      case '|':                      /* or | or ) */
     case '|':  
2243      case ')':      case ')':
2244      *firstbyteptr = firstbyte;      *firstbyteptr = firstbyte;
2245      *reqbyteptr = reqbyte;      *reqbyteptr = reqbyte;
2246      *codeptr = code;      *codeptr = code;
2247      *ptrptr = ptr;      *ptrptr = ptr;
2248        if (lengthptr != NULL)
2249          {
2250          *lengthptr += code - last_code;   /* To include callout length */
2251          DPRINTF((">> end branch\n"));
2252          }
2253      return TRUE;      return TRUE;
2254    
2255    
2256        /* ===================================================================*/
2257      /* Handle single-character metacharacters. In multiline mode, ^ disables      /* Handle single-character metacharacters. In multiline mode, ^ disables
2258      the setting of any following char as a first character. */      the setting of any following char as a first character. */
2259    
# Line 1731  for (;; ptr++) Line 2282  for (;; ptr++)
2282      *code++ = OP_ANY;      *code++ = OP_ANY;
2283      break;      break;
2284    
2285      /* Character classes. If the included characters are all < 255 in value, we  
2286      build a 32-byte bitmap of the permitted characters, except in the special      /* ===================================================================*/
2287      case where there is only one such character. For negated classes, we build      /* Character classes. If the included characters are all < 256, we build a
2288      the map as usual, then invert it at the end. However, we use a different      32-byte bitmap of the permitted characters, except in the special case
2289      opcode so that data characters > 255 can be handled correctly.      where there is only one such character. For negated classes, we build the
2290        map as usual, then invert it at the end. However, we use a different opcode
2291        so that data characters > 255 can be handled correctly.
2292    
2293      If the class contains characters outside the 0-255 range, a different      If the class contains characters outside the 0-255 range, a different
2294      opcode is compiled. It may optionally have a bit map for characters < 256,      opcode is compiled. It may optionally have a bit map for characters < 256,
# Line 1769  for (;; ptr++) Line 2322  for (;; ptr++)
2322        }        }
2323    
2324      /* Keep a count of chars with values < 256 so that we can optimize the case      /* Keep a count of chars with values < 256 so that we can optimize the case
2325      of just a single character (as long as it's < 256). For higher valued UTF-8      of just a single character (as long as it's < 256). However, for higher
2326      characters, we don't yet do any optimization. */      valued UTF-8 characters, we don't yet do any optimization. */
2327    
2328      class_charcount = 0;      class_charcount = 0;
2329      class_lastchar = -1;      class_lastchar = -1;
2330    
2331        /* Initialize the 32-char bit map to all zeros. We build the map in a
2332        temporary bit of memory, in case the class contains only 1 character (less
2333        than 256), because in that case the compiled code doesn't use the bit map.
2334        */
2335    
2336        memset(classbits, 0, 32 * sizeof(uschar));
2337    
2338  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2339      class_utf8 = FALSE;                       /* No chars >= 256 */      class_utf8 = FALSE;                       /* No chars >= 256 */
2340      class_utf8data = code + LINK_SIZE + 34;   /* For UTF-8 items */      class_utf8data = code + LINK_SIZE + 2;    /* For UTF-8 items */
2341  #endif  #endif
2342    
     /* Initialize the 32-char bit map to all zeros. We have to build the  
     map in a temporary bit of store, in case the class contains only 1  
     character (< 256), because in that case the compiled code doesn't use the  
     bit map. */  
   
     memset(classbits, 0, 32 * sizeof(uschar));  
   
2343      /* Process characters until ] is reached. By writing this as a "do" it      /* Process characters until ] is reached. By writing this as a "do" it
2344      means that an initial ] is taken as a data character. The first pass      means that an initial ] is taken as a data character. At the start of the
2345      through the regex checked the overall syntax, so we don't need to be very      loop, c contains the first byte of the character. */
     strict here. At the start of the loop, c contains the first byte of the  
     character. */  
2346    
2347      do      if (c != 0) do
2348        {        {
2349          const uschar *oldptr;
2350    
2351  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2352        if (utf8 && c > 127)        if (utf8 && c > 127)
2353          {                           /* Braces are required because the */          {                           /* Braces are required because the */
# Line 1806  for (;; ptr++) Line 2359  for (;; ptr++)
2359    
2360        if (inescq)        if (inescq)
2361          {          {
2362          if (c == '\\' && ptr[1] == 'E')          if (c == '\\' && ptr[1] == 'E')     /* If we are at \E */
2363            {            {
2364            inescq = FALSE;            inescq = FALSE;                   /* Reset literal state */
2365            ptr++;            ptr++;                            /* Skip the 'E' */
2366            continue;            continue;                         /* Carry on with next */
2367            }            }
2368          else goto LONE_SINGLE_CHARACTER;          goto CHECK_RANGE;                   /* Could be range if \E follows */
2369          }          }
2370    
2371        /* Handle POSIX class names. Perl allows a negation extension of the        /* Handle POSIX class names. Perl allows a negation extension of the
# Line 1826  for (;; ptr++) Line 2379  for (;; ptr++)
2379            check_posix_syntax(ptr, &tempptr, cd))            check_posix_syntax(ptr, &tempptr, cd))
2380          {          {
2381          BOOL local_negate = FALSE;          BOOL local_negate = FALSE;
2382          int posix_class, i;          int posix_class, taboffset, tabopt;
2383          register const uschar *cbits = cd->cbits;          register const uschar *cbits = cd->cbits;
2384            uschar pbits[32];
2385    
2386          if (ptr[1] != ':')          if (ptr[1] != ':')
2387            {            {
# Line 1856  for (;; ptr++) Line 2410  for (;; ptr++)
2410          if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)          if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2411            posix_class = 0;            posix_class = 0;
2412    
2413          /* Or into the map we are building up to 3 of the static class          /* We build the bit map for the POSIX class in a chunk of local store
2414          tables, or their negations. The [:blank:] class sets up the same          because we may be adding and subtracting from it, and we don't want to
2415          chars as the [:space:] class (all white space). We remove the vertical          subtract bits that may be in the main map already. At the end we or the
2416          white space chars afterwards. */          result into the bit map that is being built. */
2417    
2418          posix_class *= 3;          posix_class *= 3;
2419          for (i = 0; i < 3; i++)  
2420            /* Copy in the first table (always present) */
2421    
2422            memcpy(pbits, cbits + posix_class_maps[posix_class],
2423              32 * sizeof(uschar));
2424    
2425            /* If there is a second table, add or remove it as required. */
2426    
2427            taboffset = posix_class_maps[posix_class + 1];
2428            tabopt = posix_class_maps[posix_class + 2];
2429    
2430            if (taboffset >= 0)
2431            {            {
2432            BOOL blankclass = strncmp((char *)ptr, "blank", 5) == 0;            if (tabopt >= 0)
2433            int taboffset = posix_class_maps[posix_class + i];              for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
           if (taboffset < 0) break;  
           if (local_negate)  
             {  
             if (i == 0)  
               for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+taboffset];  
             else  
               for (c = 0; c < 32; c++) classbits[c] &= ~cbits[c+taboffset];  
             if (blankclass) classbits[1] |= 0x3c;  
             }  
2434            else            else
2435              {              for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+taboffset];  
             if (blankclass) classbits[1] &= ~0x3c;  
             }  
2436            }            }
2437    
2438            /* Now see if we need to remove any special characters. An option
2439            value of 1 removes vertical space and 2 removes underscore. */
2440    
2441            if (tabopt < 0) tabopt = -tabopt;
2442            if (tabopt == 1) pbits[1] &= ~0x3c;
2443              else if (tabopt == 2) pbits[11] &= 0x7f;
2444    
2445            /* Add the POSIX table or its complement into the main table that is
2446            being built and we are done. */
2447    
2448            if (local_negate)
2449              for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2450            else
2451              for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2452    
2453          ptr = tempptr + 1;          ptr = tempptr + 1;
2454          class_charcount = 10;  /* Set > 1; assumes more than 1 per class */          class_charcount = 10;  /* Set > 1; assumes more than 1 per class */
2455          continue;    /* End of POSIX syntax handling */          continue;    /* End of POSIX syntax handling */
2456          }          }
2457    
2458        /* Backslash may introduce a single character, or it may introduce one        /* Backslash may introduce a single character, or it may introduce one
2459        of the specials, which just set a flag. Escaped items are checked for        of the specials, which just set a flag. The sequence \b is a special
2460        validity in the pre-compiling pass. The sequence \b is a special case.        case. Inside a class (and only there) it is treated as backspace.
2461        Inside a class (and only there) it is treated as backspace. Elsewhere        Elsewhere it marks a word boundary. Other escapes have preset maps ready
2462        it marks a word boundary. Other escapes have preset maps ready to        to or into the one we are building. We assume they have more than one
       or into the one we are building. We assume they have more than one  
2463        character in them, so set class_charcount bigger than one. */        character in them, so set class_charcount bigger than one. */
2464    
2465        if (c == '\\')        if (c == '\\')
2466          {          {
2467          c = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);          c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2468            if (*errorcodeptr != 0) goto FAILED;
2469    
2470          if (-c == ESC_b) c = '\b';       /* \b is backslash in a class */          if (-c == ESC_b) c = '\b';       /* \b is backslash in a class */
2471          else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */          else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */
2472            else if (-c == ESC_R) c = 'R';   /* \R is literal R in a class */
2473          else if (-c == ESC_Q)            /* Handle start of quoted string */          else if (-c == ESC_Q)            /* Handle start of quoted string */
2474            {            {
2475            if (ptr[1] == '\\' && ptr[2] == 'E')            if (ptr[1] == '\\' && ptr[2] == 'E')
# Line 1915  for (;; ptr++) Line 2484  for (;; ptr++)
2484            {            {
2485            register const uschar *cbits = cd->cbits;            register const uschar *cbits = cd->cbits;
2486            class_charcount += 2;     /* Greater than 1 is what matters */            class_charcount += 2;     /* Greater than 1 is what matters */
2487            switch (-c)  
2488              /* Save time by not doing this in the pre-compile phase. */
2489    
2490              if (lengthptr == NULL) switch (-c)
2491              {              {
2492              case ESC_d:              case ESC_d:
2493              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
# Line 1943  for (;; ptr++) Line 2515  for (;; ptr++)
2515              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
2516              continue;              continue;
2517    
2518                case ESC_E: /* Perl ignores an orphan \E */
2519                continue;
2520    
2521                default:    /* Not recognized; fall through */
2522                break;      /* Need "default" setting to stop compiler warning. */
2523                }
2524    
2525              /* In the pre-compile phase, just do the recognition. */
2526    
2527              else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2528                       c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2529    
2530              /* We need to deal with \P and \p in both phases. */
2531    
2532  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2533              case ESC_p:            if (-c == ESC_p || -c == ESC_P)
2534              case ESC_P:              {
2535                {              BOOL negated;
2536                BOOL negated;              int pdata;
2537                int property = get_ucp(&ptr, &negated, errorcodeptr);              int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2538                if (property < 0) goto FAILED;              if (ptype < 0) goto FAILED;
2539                class_utf8 = TRUE;              class_utf8 = TRUE;
2540                *class_utf8data++ = ((-c == ESC_p) != negated)?              *class_utf8data++ = ((-c == ESC_p) != negated)?
2541                  XCL_PROP : XCL_NOTPROP;                XCL_PROP : XCL_NOTPROP;
2542                *class_utf8data++ = property;              *class_utf8data++ = ptype;
2543                class_charcount -= 2;   /* Not a < 256 character */              *class_utf8data++ = pdata;
2544                }              class_charcount -= 2;   /* Not a < 256 character */
2545              continue;              continue;
2546                }
2547  #endif  #endif
2548              /* Unrecognized escapes are faulted if PCRE is running in its
2549              strict mode. By default, for compatibility with Perl, they are
2550              treated as literals. */
2551    
2552              /* Unrecognized escapes are faulted if PCRE is running in its            if ((options & PCRE_EXTRA) != 0)
2553              strict mode. By default, for compatibility with Perl, they are              {
2554              treated as literals. */              *errorcodeptr = ERR7;
2555                goto FAILED;
             default:  
             if ((options & PCRE_EXTRA) != 0)  
               {  
               *errorcodeptr = ERR7;  
               goto FAILED;  
               }  
             c = *ptr;              /* The final character */  
             class_charcount -= 2;  /* Undo the default count from above */  
2556              }              }
2557    
2558              class_charcount -= 2;  /* Undo the default count from above */
2559              c = *ptr;              /* Get the final character and fall through */
2560            }            }
2561    
2562          /* Fall through if we have a single character (c >= 0). This may be          /* Fall through if we have a single character (c >= 0). This may be
2563          > 256 in UTF-8 mode. */          greater than 256 in UTF-8 mode. */
2564    
2565          }   /* End of backslash handling */          }   /* End of backslash handling */
2566    
2567        /* A single character may be followed by '-' to form a range. However,        /* A single character may be followed by '-' to form a range. However,
2568        Perl does not permit ']' to be the end of the range. A '-' character        Perl does not permit ']' to be the end of the range. A '-' character
2569        here is treated as a literal. */        at the end is treated as a literal. Perl ignores orphaned \E sequences
2570          entirely. The code for handling \Q and \E is messy. */
2571    
2572          CHECK_RANGE:
2573          while (ptr[1] == '\\' && ptr[2] == 'E')
2574            {
2575            inescq = FALSE;
2576            ptr += 2;
2577            }
2578    
2579          oldptr = ptr;
2580    
2581        if (ptr[1] == '-' && ptr[2] != ']')        if (!inescq && ptr[1] == '-')
2582          {          {
2583          int d;          int d;
2584          ptr += 2;          ptr += 2;
2585            while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
2586    
2587            /* If we hit \Q (not followed by \E) at this point, go into escaped
2588            mode. */
2589    
2590            while (*ptr == '\\' && ptr[1] == 'Q')
2591              {
2592              ptr += 2;
2593              if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
2594              inescq = TRUE;
2595              break;
2596              }
2597    
2598            if (*ptr == 0 || (!inescq && *ptr == ']'))
2599              {
2600              ptr = oldptr;
2601              goto LONE_SINGLE_CHARACTER;
2602              }
2603    
2604  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2605          if (utf8)          if (utf8)
# Line 2001  for (;; ptr++) Line 2614  for (;; ptr++)
2614          not any of the other escapes. Perl 5.6 treats a hyphen as a literal          not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2615          in such circumstances. */          in such circumstances. */
2616    
2617          if (d == '\\')          if (!inescq && d == '\\')
2618            {            {
2619            const uschar *oldptr = ptr;            d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2620            d = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);            if (*errorcodeptr != 0) goto FAILED;
2621    
2622            /* \b is backspace; \X is literal X; any other special means the '-'            /* \b is backspace; \X is literal X; \R is literal R; any other
2623            was literal */            special means the '-' was literal */
2624    
2625            if (d < 0)            if (d < 0)
2626              {              {
2627              if (d == -ESC_b) d = '\b';              if (d == -ESC_b) d = '\b';
2628              else if (d == -ESC_X) d = 'X'; else              else if (d == -ESC_X) d = 'X';
2629                else if (d == -ESC_R) d = 'R'; else
2630                {                {
2631                ptr = oldptr - 2;                ptr = oldptr;
2632                goto LONE_SINGLE_CHARACTER;  /* A few lines below */                goto LONE_SINGLE_CHARACTER;  /* A few lines below */
2633                }                }
2634              }              }
2635            }            }
2636    
2637          /* The check that the two values are in the correct order happens in          /* Check that the two values are in the correct order. Optimize
2638          the pre-pass. Optimize one-character ranges */          one-character ranges */
2639    
2640            if (d < c)
2641              {
2642              *errorcodeptr = ERR8;
2643              goto FAILED;
2644              }
2645    
2646          if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */          if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */
2647    
# Line 2042  for (;; ptr++) Line 2662  for (;; ptr++)
2662  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2663            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
2664              {              {
2665              int occ, ocd;              unsigned int occ, ocd;
2666              int cc = c;              unsigned int cc = c;
2667              int origd = d;              unsigned int origd = d;
2668              while (get_othercase_range(&cc, origd, &occ, &ocd))              while (get_othercase_range(&cc, origd, &occ, &ocd))
2669                {                {
2670                if (occ >= c && ocd <= d) continue;  /* Skip embedded ranges */                if (occ >= c && ocd <= d) continue;  /* Skip embedded ranges */
# Line 2102  for (;; ptr++) Line 2722  for (;; ptr++)
2722          ranges that lie entirely within 0-127 when there is UCP support; else          ranges that lie entirely within 0-127 when there is UCP support; else
2723          for partial ranges without UCP support. */          for partial ranges without UCP support. */
2724    
2725          for (; c <= d; c++)          class_charcount += d - c + 1;
2726            class_lastchar = d;
2727    
2728            /* We can save a bit of time by skipping this in the pre-compile. */
2729    
2730            if (lengthptr == NULL) for (; c <= d; c++)
2731            {            {
2732            classbits[c/8] |= (1 << (c&7));            classbits[c/8] |= (1 << (c&7));
2733            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
# Line 2110  for (;; ptr++) Line 2735  for (;; ptr++)
2735              int uc = cd->fcc[c];           /* flip case */              int uc = cd->fcc[c];           /* flip case */
2736              classbits[uc/8] |= (1 << (uc&7));              classbits[uc/8] |= (1 << (uc&7));
2737              }              }
           class_charcount++;                /* in case a one-char range */  
           class_lastchar = c;  
2738            }            }
2739    
2740          continue;   /* Go get the next char in the class */          continue;   /* Go get the next char in the class */
# Line 2135  for (;; ptr++) Line 2758  for (;; ptr++)
2758  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2759          if ((options & PCRE_CASELESS) != 0)          if ((options & PCRE_CASELESS) != 0)
2760            {            {
2761            int chartype;            unsigned int othercase;
2762            int othercase;            if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
           if (_pcre_ucp_findchar(c, &chartype, &othercase) >= 0 &&  
                othercase > 0)  
2763              {              {
2764              *class_utf8data++ = XCL_SINGLE;              *class_utf8data++ = XCL_SINGLE;
2765              class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);              class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
# Line 2163  for (;; ptr++) Line 2784  for (;; ptr++)
2784          }          }
2785        }        }
2786    
2787      /* Loop until ']' reached; the check for end of string happens inside the      /* Loop until ']' reached. This "while" is the end of the "do" above. */
     loop. This "while" is the end of the "do" above. */  
2788    
2789      while ((c = *(++ptr)) != ']' || inescq);      while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
2790    
2791        if (c == 0)                          /* Missing terminating ']' */
2792          {
2793          *errorcodeptr = ERR6;
2794          goto FAILED;
2795          }
2796    
2797      /* If class_charcount is 1, we saw precisely one character whose value is      /* If class_charcount is 1, we saw precisely one character whose value is
2798      less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we      less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
# Line 2230  for (;; ptr++) Line 2856  for (;; ptr++)
2856    
2857      /* If there are characters with values > 255, we have to compile an      /* If there are characters with values > 255, we have to compile an
2858      extended class, with its own opcode. If there are no characters < 256,      extended class, with its own opcode. If there are no characters < 256,
2859      we can omit the bitmap. */      we can omit the bitmap in the actual compiled code. */
2860    
2861  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2862      if (class_utf8)      if (class_utf8)
# Line 2240  for (;; ptr++) Line 2866  for (;; ptr++)
2866        code += LINK_SIZE;        code += LINK_SIZE;
2867        *code = negate_class? XCL_NOT : 0;        *code = negate_class? XCL_NOT : 0;
2868    
2869        /* If the map is required, install it, and move on to the end of        /* If the map is required, move up the extra data to make room for it;
2870        the extra data */        otherwise just move the code pointer to the end of the extra data. */
2871    
2872        if (class_charcount > 0)        if (class_charcount > 0)
2873          {          {
2874          *code++ |= XCL_MAP;          *code++ |= XCL_MAP;
2875            memmove(code + 32, code, class_utf8data - code);
2876          memcpy(code, classbits, 32);          memcpy(code, classbits, 32);
2877          code = class_utf8data;          code = class_utf8data + 32;
         }  
   
       /* If the map is not required, slide down the extra data. */  
   
       else  
         {  
         int len = class_utf8data - (code + 33);  
         memmove(code + 1, code + 33, len);  
         code += len + 1;  
2878          }          }
2879          else code = class_utf8data;
2880    
2881        /* Now fill in the complete length of the item */        /* Now fill in the complete length of the item */
2882    
# Line 2274  for (;; ptr++) Line 2893  for (;; ptr++)
2893      if (negate_class)      if (negate_class)
2894        {        {
2895        *code++ = OP_NCLASS;        *code++ = OP_NCLASS;
2896        for (c = 0; c < 32; c++) code[c] = ~classbits[c];        if (lengthptr == NULL)    /* Save time in the pre-compile phase */
2897            for (c = 0; c < 32; c++) code[c] = ~classbits[c];
2898        }        }
2899      else      else
2900        {        {
# Line 2284  for (;; ptr++) Line 2904  for (;; ptr++)
2904      code += 32;      code += 32;
2905      break;      break;
2906    
2907    
2908        /* ===================================================================*/
2909      /* Various kinds of repeat; '{' is not necessarily a quantifier, but this      /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
2910      has been tested above. */      has been tested above. */
2911    
# Line 2351  for (;; ptr++) Line 2973  for (;; ptr++)
2973        }        }
2974      else repeat_type = greedy_default;      else repeat_type = greedy_default;
2975    
     /* If previous was a recursion, we need to wrap it inside brackets so that  
     it can be replicated if necessary. */  
   
     if (*previous == OP_RECURSE)  
       {  
       memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);  
       code += 1 + LINK_SIZE;  
       *previous = OP_BRA;  
       PUT(previous, 1, code - previous);  
       *code = OP_KET;  
       PUT(code, 1, code - previous);  
       code += 1 + LINK_SIZE;  
       }  
   
2976      /* If previous was a character match, abolish the item and generate a      /* If previous was a character match, abolish the item and generate a
2977      repeat item instead. If a char item has a minimum of more than one, ensure      repeat item instead. If a char item has a minimum of more than one, ensure
2978      that it is set in reqbyte - it might not be if a sequence such as x{3} is      that it is set in reqbyte - it might not be if a sequence such as x{3} is
# Line 2398  for (;; ptr++) Line 3006  for (;; ptr++)
3006          if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;          if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3007          }          }
3008    
3009          /* If the repetition is unlimited, it pays to see if the next thing on
3010          the line is something that cannot possibly match this character. If so,
3011          automatically possessifying this item gains some performance in the case
3012          where the match fails. */
3013    
3014          if (!possessive_quantifier &&
3015              repeat_max < 0 &&
3016              check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3017                options, cd))
3018            {
3019            repeat_type = 0;    /* Force greedy */
3020            possessive_quantifier = TRUE;
3021            }
3022    
3023        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
3024        }        }
3025    
3026      /* If previous was a single negated character ([^a] or similar), we use      /* If previous was a single negated character ([^a] or similar), we use
3027      one of the special opcodes, replacing it. The code is shared with single-      one of the special opcodes, replacing it. The code is shared with single-
3028      character repeats by setting opt_type to add a suitable offset into      character repeats by setting opt_type to add a suitable offset into
3029      repeat_type. OP_NOT is currently used only for single-byte chars. */      repeat_type. We can also test for auto-possessification. OP_NOT is
3030        currently used only for single-byte chars. */
3031    
3032      else if (*previous == OP_NOT)      else if (*previous == OP_NOT)
3033        {        {
3034        op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */        op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */
3035        c = previous[1];        c = previous[1];
3036          if (!possessive_quantifier &&
3037              repeat_max < 0 &&
3038              check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3039            {
3040            repeat_type = 0;    /* Force greedy */
3041            possessive_quantifier = TRUE;
3042            }
3043        goto OUTPUT_SINGLE_REPEAT;        goto OUTPUT_SINGLE_REPEAT;
3044        }        }
3045    
# Line 2423  for (;; ptr++) Line 3053  for (;; ptr++)
3053      else if (*previous < OP_EODN)      else if (*previous < OP_EODN)
3054        {        {
3055        uschar *oldcode;        uschar *oldcode;
3056        int prop_type;        int prop_type, prop_value;
3057        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
3058        c = *previous;        c = *previous;
3059    
3060          if (!possessive_quantifier &&
3061              repeat_max < 0 &&
3062              check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3063            {
3064            repeat_type = 0;    /* Force greedy */
3065            possessive_quantifier = TRUE;
3066            }
3067    
3068        OUTPUT_SINGLE_REPEAT:        OUTPUT_SINGLE_REPEAT:
3069        prop_type = (*previous == OP_PROP || *previous == OP_NOTPROP)?        if (*previous == OP_PROP || *previous == OP_NOTPROP)
3070          previous[1] : -1;          {
3071            prop_type = previous[1];
3072            prop_value = previous[2];
3073            }
3074          else prop_type = prop_value = -1;
3075    
3076        oldcode = code;        oldcode = code;
3077        code = previous;                  /* Usually overwrite previous item */        code = previous;                  /* Usually overwrite previous item */
# Line 2463  for (;; ptr++) Line 3105  for (;; ptr++)
3105          }          }
3106    
3107        /* A repeat minimum of 1 is optimized into some special cases. If the        /* A repeat minimum of 1 is optimized into some special cases. If the
3108        maximum is unlimited, we use OP_PLUS. Otherwise, the original item it        maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3109        left in place and, if the maximum is greater than 1, we use OP_UPTO with        left in place and, if the maximum is greater than 1, we use OP_UPTO with
3110        one less than the maximum. */        one less than the maximum. */
3111    
# Line 2490  for (;; ptr++) Line 3132  for (;; ptr++)
3132    
3133          /* If the maximum is unlimited, insert an OP_STAR. Before doing so,          /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3134          we have to insert the character for the previous code. For a repeated          we have to insert the character for the previous code. For a repeated
3135          Unicode property match, there is an extra byte that defines the          Unicode property match, there are two extra bytes that define the
3136          required property. In UTF-8 mode, long characters have their length in          required property. In UTF-8 mode, long characters have their length in
3137          c, with the 0x80 bit as a flag. */          c, with the 0x80 bit as a flag. */
3138    
# Line 2506  for (;; ptr++) Line 3148  for (;; ptr++)
3148  #endif  #endif
3149              {              {
3150              *code++ = c;              *code++ = c;
3151              if (prop_type >= 0) *code++ = prop_type;              if (prop_type >= 0)
3152                  {
3153                  *code++ = prop_type;
3154                  *code++ = prop_value;
3155                  }
3156              }              }
3157            *code++ = OP_STAR + repeat_type;            *code++ = OP_STAR + repeat_type;
3158            }            }
3159    
3160          /* Else insert an UPTO if the max is greater than the min, again          /* Else insert an UPTO if the max is greater than the min, again
3161          preceded by the character, for the previously inserted code. */          preceded by the character, for the previously inserted code. If the
3162            UPTO is just for 1 instance, we can use QUERY instead. */
3163    
3164          else if (repeat_max != repeat_min)          else if (repeat_max != repeat_min)
3165            {            {
# Line 2525  for (;; ptr++) Line 3172  for (;; ptr++)
3172            else            else
3173  #endif  #endif
3174            *code++ = c;            *code++ = c;
3175            if (prop_type >= 0) *code++ = prop_type;            if (prop_type >= 0)
3176                {
3177                *code++ = prop_type;
3178                *code++ = prop_value;
3179                }
3180            repeat_max -= repeat_min;            repeat_max -= repeat_min;
3181            *code++ = OP_UPTO + repeat_type;  
3182            PUT2INC(code, 0, repeat_max);            if (repeat_max == 1)
3183                {
3184                *code++ = OP_QUERY + repeat_type;
3185                }
3186              else
3187                {
3188                *code++ = OP_UPTO + repeat_type;
3189                PUT2INC(code, 0, repeat_max);
3190                }
3191            }            }
3192          }          }
3193    
# Line 2544  for (;; ptr++) Line 3203  for (;; ptr++)
3203  #endif  #endif
3204        *code++ = c;        *code++ = c;
3205    
3206        /* For a repeated Unicode property match, there is an extra byte that        /* For a repeated Unicode property match, there are two extra bytes that
3207        defines the required property. */        define the required property. */
3208    
3209  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
3210        if (prop_type >= 0) *code++ = prop_type;        if (prop_type >= 0)
3211            {
3212            *code++ = prop_type;
3213            *code++ = prop_value;
3214            }
3215  #endif  #endif
3216        }        }
3217    
# Line 2591  for (;; ptr++) Line 3254  for (;; ptr++)
3254      /* If previous was a bracket group, we may have to replicate it in certain      /* If previous was a bracket group, we may have to replicate it in certain
3255      cases. */      cases. */
3256    
3257      else if (*previous >= OP_BRA || *previous == OP_ONCE ||      else if (*previous == OP_BRA  || *previous == OP_CBRA ||
3258               *previous == OP_COND)               *previous == OP_ONCE || *previous == OP_COND)
3259        {        {
3260        register int i;        register int i;
3261        int ketoffset = 0;        int ketoffset = 0;
3262        int len = code - previous;        int len = code - previous;
3263        uschar *bralink = NULL;        uschar *bralink = NULL;
3264    
3265          /* Repeating a DEFINE group is pointless */
3266    
3267          if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3268            {
3269            *errorcodeptr = ERR55;
3270            goto FAILED;
3271            }
3272    
3273          /* This is a paranoid check to stop integer overflow later on */
3274    
3275          if (len > MAX_DUPLENGTH)
3276            {
3277            *errorcodeptr = ERR50;
3278            goto FAILED;
3279            }
3280    
3281        /* If the maximum repeat count is unlimited, find the end of the bracket        /* If the maximum repeat count is unlimited, find the end of the bracket
3282        by scanning through from the start, and compute the offset back to it        by scanning through from the start, and compute the offset back to it
3283        from the current code pointer. There may be an OP_OPT setting following        from the current code pointer. There may be an OP_OPT setting following
# Line 2633  for (;; ptr++) Line 3312  for (;; ptr++)
3312          /* If the maximum is 1 or unlimited, we just have to stick in the          /* If the maximum is 1 or unlimited, we just have to stick in the
3313          BRAZERO and do no more at this point. However, we do need to adjust          BRAZERO and do no more at this point. However, we do need to adjust
3314          any OP_RECURSE calls inside the group that refer to the group itself or          any OP_RECURSE calls inside the group that refer to the group itself or
3315          any internal group, because the offset is from the start of the whole          any internal or forward referenced group, because the offset is from
3316          regex. Temporarily terminate the pattern while doing this. */          the start of the whole regex. Temporarily terminate the pattern while
3317            doing this. */
3318    
3319          if (repeat_max <= 1)          if (repeat_max <= 1)
3320            {            {
3321            *code = OP_END;            *code = OP_END;
3322            adjust_recurse(previous, 1, utf8, cd);            adjust_recurse(previous, 1, utf8, cd, save_hwm);
3323            memmove(previous+1, previous, len);            memmove(previous+1, previous, len);
3324            code++;            code++;
3325            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
# Line 2657  for (;; ptr++) Line 3337  for (;; ptr++)
3337            {            {
3338            int offset;            int offset;
3339            *code = OP_END;            *code = OP_END;
3340            adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd);            adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3341            memmove(previous + 2 + LINK_SIZE, previous, len);            memmove(previous + 2 + LINK_SIZE, previous, len);
3342            code += 2 + LINK_SIZE;            code += 2 + LINK_SIZE;
3343            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
# Line 2677  for (;; ptr++) Line 3357  for (;; ptr++)
3357        /* If the minimum is greater than zero, replicate the group as many        /* If the minimum is greater than zero, replicate the group as many
3358        times as necessary, and adjust the maximum to the number of subsequent        times as necessary, and adjust the maximum to the number of subsequent
3359        copies that we need. If we set a first char from the group, and didn't        copies that we need. If we set a first char from the group, and didn't
3360        set a required char, copy the latter from the former. */        set a required char, copy the latter from the former. If there are any
3361          forward reference subroutine calls in the group, there will be entries on
3362          the workspace list; replicate these with an appropriate increment. */
3363    
3364        else        else
3365          {          {
3366          if (repeat_min > 1)          if (repeat_min > 1)
3367            {            {
3368            if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;            /* In the pre-compile phase, we don't actually do the replication. We
3369            for (i = 1; i < repeat_min; i++)            just adjust the length as if we had. */
3370    
3371              if (lengthptr != NULL)
3372                *lengthptr += (repeat_min - 1)*length_prevgroup;
3373    
3374              /* This is compiling for real */
3375    
3376              else
3377              {              {
3378              memcpy(code, previous, len);              if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3379              code += len;              for (i = 1; i < repeat_min; i++)
3380                  {
3381                  uschar *hc;
3382                  uschar *this_hwm = cd->hwm;
3383                  memcpy(code, previous, len);
3384                  for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3385                    {
3386                    PUT(cd->hwm, 0, GET(hc, 0) + len);
3387                    cd->hwm += LINK_SIZE;
3388                    }
3389                  save_hwm = this_hwm;
3390                  code += len;
3391                  }
3392              }              }
3393            }            }
3394    
3395          if (repeat_max > 0) repeat_max -= repeat_min;          if (repeat_max > 0) repeat_max -= repeat_min;
3396          }          }
3397    
# Line 2697  for (;; ptr++) Line 3399  for (;; ptr++)
3399        the maximum is limited, it replicates the group in a nested fashion,        the maximum is limited, it replicates the group in a nested fashion,
3400        remembering the bracket starts on a stack. In the case of a zero minimum,        remembering the bracket starts on a stack. In the case of a zero minimum,
3401        the first one was set up above. In all cases the repeat_max now specifies        the first one was set up above. In all cases the repeat_max now specifies
3402        the number of additional copies needed. */        the number of additional copies needed. Again, we must remember to
3403          replicate entries on the forward reference list. */
3404    
3405        if (repeat_max >= 0)        if (repeat_max >= 0)
3406          {          {
3407          for (i = repeat_max - 1; i >= 0; i--)          /* In the pre-compile phase, we don't actually do the replication. We
3408            just adjust the length as if we had. For each repetition we must add 1
3409            to the length for BRAZERO and for all but the last repetition we must
3410            add 2 + 2*LINK_SIZE to allow for the nesting that occurs. */
3411    
3412            if (lengthptr != NULL && repeat_max > 0)
3413              *lengthptr += repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3414                2 - 2*LINK_SIZE;  /* Last one doesn't nest */
3415    
3416            /* This is compiling for real */
3417    
3418            else for (i = repeat_max - 1; i >= 0; i--)
3419            {            {
3420              uschar *hc;
3421              uschar *this_hwm = cd->hwm;
3422    
3423            *code++ = OP_BRAZERO + repeat_type;            *code++ = OP_BRAZERO + repeat_type;
3424    
3425            /* All but the final copy start a new nesting, maintaining the            /* All but the final copy start a new nesting, maintaining the
# Line 2718  for (;; ptr++) Line 3435  for (;; ptr++)
3435              }              }
3436    
3437            memcpy(code, previous, len);            memcpy(code, previous, len);
3438              for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3439                {
3440                PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3441                cd->hwm += LINK_SIZE;
3442                }
3443              save_hwm = this_hwm;
3444            code += len;            code += len;
3445            }            }
3446    
# Line 2740  for (;; ptr++) Line 3463  for (;; ptr++)
3463        /* If the maximum is unlimited, set a repeater in the final copy. We        /* If the maximum is unlimited, set a repeater in the final copy. We
3464        can't just offset backwards from the current code point, because we        can't just offset backwards from the current code point, because we
3465        don't know if there's been an options resetting after the ket. The        don't know if there's been an options resetting after the ket. The
3466        correct offset was computed above. */        correct offset was computed above.
3467    
3468        else code[-ketoffset] = OP_KETRMAX + repeat_type;        Then, when we are doing the actual compile phase, check to see whether
3469          this group is a non-atomic one that could match an empty string. If so,
3470          convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
3471          that runtime checking can be done. [This check is also applied to
3472          atomic groups at runtime, but in a different way.] */
3473    
3474          else
3475            {
3476            uschar *ketcode = code - ketoffset;
3477            uschar *bracode = ketcode - GET(ketcode, 1);
3478            *ketcode = OP_KETRMAX + repeat_type;
3479            if (lengthptr == NULL && *bracode != OP_ONCE)
3480              {
3481              uschar *scode = bracode;
3482              do
3483                {
3484                if (could_be_empty_branch(scode, ketcode, utf8))
3485                  {
3486                  *bracode += OP_SBRA - OP_BRA;
3487                  break;
3488                  }
3489                scode += GET(scode, 1);
3490                }
3491              while (*scode == OP_ALT);
3492              }
3493            }
3494        }        }
3495    
3496      /* Else there's some kind of shambles */      /* Else there's some kind of shambles */
# Line 2753  for (;; ptr++) Line 3501  for (;; ptr++)
3501        goto FAILED;        goto FAILED;
3502        }        }
3503    
3504      /* If the character following a repeat is '+', we wrap the entire repeated      /* If the character following a repeat is '+', or if certain optimization
3505      item inside OP_ONCE brackets. This is just syntactic sugar, taken from      tests above succeeded, possessive_quantifier is TRUE. For some of the
3506      Sun's Java package. The repeated item starts at tempcode, not at previous,      simpler opcodes, there is a special alternative opcode for this. For
3507      which might be the first part of a string whose (former) last char we      anything else, we wrap the entire repeated item inside OP_ONCE brackets.
3508      repeated. However, we don't support '+' after a greediness '?'. */      The '+' notation is just syntactic sugar, taken from Sun's Java package,
3509        but the special opcodes can optimize it a bit. The repeated item starts at
3510        tempcode, not at previous, which might be the first part of a string whose
3511        (former) last char we repeated.
3512    
3513        Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
3514        an 'upto' may follow. We skip over an 'exact' item, and then test the
3515        length of what remains before proceeding. */
3516    
3517      if (possessive_quantifier)      if (possessive_quantifier)
3518        {        {
3519        int len = code - tempcode;        int len;
3520        memmove(tempcode + 1+LINK_SIZE, tempcode, len);        if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
3521        code += 1 + LINK_SIZE;            *tempcode == OP_NOTEXACT)
3522        len += 1 + LINK_SIZE;          tempcode += _pcre_OP_lengths[*tempcode];
3523        tempcode[0] = OP_ONCE;        len = code - tempcode;
3524        *code++ = OP_KET;        if (len > 0) switch (*tempcode)
3525        PUTINC(code, 0, len);          {
3526        PUT(tempcode, 1, len);          case OP_STAR:  *tempcode = OP_POSSTAR; break;
3527            case OP_PLUS:  *tempcode = OP_POSPLUS; break;
3528            case OP_QUERY: *tempcode = OP_POSQUERY; break;
3529            case OP_UPTO:  *tempcode = OP_POSUPTO; break;
3530    
3531            case OP_TYPESTAR:  *tempcode = OP_TYPEPOSSTAR; break;
3532            case OP_TYPEPLUS:  *tempcode = OP_TYPEPOSPLUS; break;
3533            case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
3534            case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;
3535    
3536            case OP_NOTSTAR:  *tempcode = OP_NOTPOSSTAR; break;
3537            case OP_NOTPLUS:  *tempcode = OP_NOTPOSPLUS; break;
3538            case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
3539            case OP_NOTUPTO:  *tempcode = OP_NOTPOSUPTO; break;
3540    
3541            default:
3542            memmove(tempcode + 1+LINK_SIZE, tempcode, len);
3543            code += 1 + LINK_SIZE;
3544            len += 1 + LINK_SIZE;
3545            tempcode[0] = OP_ONCE;
3546            *code++ = OP_KET;
3547            PUTINC(code, 0, len);
3548            PUT(tempcode, 1, len);
3549            break;
3550            }
3551        }        }
3552    
3553      /* In all cases we no longer have a previous item. We also set the      /* In all cases we no longer have a previous item. We also set the
# Line 2781  for (;; ptr++) Line 3560  for (;; ptr++)
3560      break;      break;
3561    
3562    
3563      /* Start of nested bracket sub-expression, or comment or lookahead or      /* ===================================================================*/
3564      lookbehind or option setting or condition. First deal with special things      /* Start of nested parenthesized sub-expression, or comment or lookahead or
3565      that can come after a bracket; all are introduced by ?, and the appearance      lookbehind or option setting or condition or all the other extended
3566      of any of them means that this is not a referencing group. They were      parenthesis forms. First deal with the specials; all are introduced by ?,
3567      checked for validity in the first pass over the string, so we don't have to      and the appearance of any of them means that this is not a capturing
3568      check for syntax errors here.  */      group. */
3569    
3570      case '(':      case '(':
3571      newoptions = options;      newoptions = options;
3572      skipbytes = 0;      skipbytes = 0;
3573        bravalue = OP_CBRA;
3574        save_hwm = cd->hwm;
3575    
3576      if (*(++ptr) == '?')      if (*(++ptr) == '?')
3577        {        {
3578        int set, unset;        int i, set, unset, namelen;
3579        int *optset;        int *optset;
3580          const uschar *name;
3581          uschar *slot;
3582    
3583        switch (*(++ptr))        switch (*(++ptr))
3584          {          {
3585          case '#':                 /* Comment; skip to ket */          case '#':                 /* Comment; skip to ket */
3586          ptr++;          ptr++;
3587          while (*ptr != ')') ptr++;          while (*ptr != 0 && *ptr != ')') ptr++;
3588            if (*ptr == 0)
3589              {
3590              *errorcodeptr = ERR18;
3591              goto FAILED;
3592              }
3593          continue;          continue;
3594    
3595          case ':':                 /* Non-extracting bracket */  
3596            /* ------------------------------------------------------------ */
3597            case ':':                 /* Non-capturing bracket */
3598          bravalue = OP_BRA;          bravalue = OP_BRA;
3599          ptr++;          ptr++;
3600          break;          break;
3601    
3602    
3603            /* ------------------------------------------------------------ */
3604          case '(':          case '(':
3605          bravalue = OP_COND;       /* Conditional group */          bravalue = OP_COND;       /* Conditional group */
3606    
3607          /* Condition to test for recursion */          /* A condition can be an assertion, a number (referring to a numbered
3608            group), a name (referring to a named group), or 'R', referring to
3609            recursion. R<digits> and R&name are also permitted for recursion tests.
3610    
3611            There are several syntaxes for testing a named group: (?(name)) is used
3612            by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
3613    
3614            There are two unfortunate ambiguities, caused by history. (a) 'R' can
3615            be the recursive thing or the name 'R' (and similarly for 'R' followed
3616            by digits), and (b) a number could be a name that consists of digits.
3617            In both cases, we look for a name first; if not found, we try the other
3618            cases. */
3619    
3620            /* For conditions that are assertions, check the syntax, and then exit
3621            the switch. This will take control down to where bracketed groups,
3622            including assertions, are processed. */
3623    
3624            if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
3625              break;
3626    
3627            /* Most other conditions use OP_CREF (a couple change to OP_RREF
3628            below), and all need to skip 3 bytes at the start of the group. */
3629    
3630            code[1+LINK_SIZE] = OP_CREF;
3631            skipbytes = 3;
3632            refsign = -1;
3633    
3634            /* Check for a test for recursion in a named group. */
3635    
3636            if (ptr[1] == 'R' && ptr[2] == '&')
3637              {
3638              terminator = -1;
3639              ptr += 2;
3640              code[1+LINK_SIZE] = OP_RREF;    /* Change the type of test */
3641              }
3642    
3643            /* Check for a test for a named group's having been set, using the Perl
3644            syntax (?(<name>) or (?('name') */
3645    
3646            else if (ptr[1] == '<')
3647              {
3648              terminator = '>';
3649              ptr++;
3650              }
3651            else if (ptr[1] == '\'')
3652              {
3653              terminator = '\'';
3654              ptr++;
3655              }
3656            else
3657              {
3658              terminator = 0;
3659              if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
3660              }
3661    
3662            /* We now expect to read a name; anything else is an error */
3663    
3664            if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
3665              {
3666              ptr += 1;  /* To get the right offset */
3667              *errorcodeptr = ERR28;
3668              goto FAILED;
3669              }
3670    
3671          if (ptr[1] == 'R')          /* Read the name, but also get it as a number if it's all digits */
3672    
3673            recno = 0;
3674            name = ++ptr;
3675            while ((cd->ctypes[*ptr] & ctype_word) != 0)
3676              {
3677              if (recno >= 0)
3678                recno = ((digitab[*ptr] & ctype_digit) != 0)?
3679                  recno * 10 + *ptr - '0' : -1;
3680              ptr++;
3681              }
3682            namelen = ptr - name;
3683    
3684            if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
3685            {            {
3686            code[1+LINK_SIZE] = OP_CREF;            ptr--;      /* Error offset */
3687            PUT2(code, 2+LINK_SIZE, CREF_RECURSE);            *errorcodeptr = ERR26;
3688            skipbytes = 3;            goto FAILED;
           ptr += 3;  
3689            }            }
3690    
3691          /* Condition to test for a numbered subpattern match. We know that          /* Do no further checking in the pre-compile phase. */
3692          if a digit follows ( then there will just be digits until ) because  
3693          the syntax was checked in the first pass. */          if (lengthptr != NULL) break;
3694    
3695          else if ((digitab[ptr[1]] && ctype_digit) != 0)          /* In the real compile we do the work of looking for the actual
3696            reference. If the string started with "+" or "-" we require the rest to
3697            be digits, in which case recno will be set. */
3698    
3699            if (refsign > 0)
3700            {            {
3701            int condref;                 /* Don't amalgamate; some compilers */            if (recno <= 0)
           condref = *(++ptr) - '0';    /* grumble at autoincrement in declaration */  
           while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';  
           if (condref == 0)  
3702              {              {
3703              *errorcodeptr = ERR35;              *errorcodeptr = ERR58;
3704              goto FAILED;              goto FAILED;
3705                }
3706              if (refsign == '-')
3707                {
3708                recno = cd->bracount - recno + 1;
3709                if (recno <= 0)
3710                  {
3711                  *errorcodeptr = ERR15;
3712                  goto FAILED;
3713                  }
3714              }              }
3715            ptr++;            else recno += cd->bracount;
3716            code[1+LINK_SIZE] = OP_CREF;            PUT2(code, 2+LINK_SIZE, recno);
3717            PUT2(code, 2+LINK_SIZE, condref);            break;
3718            skipbytes = 3;            }
3719    
3720            /* Otherwise (did not start with "+" or "-"), start by looking for the
3721            name. */
3722    
3723            slot = cd->name_table;
3724            for (i = 0; i < cd->names_found; i++)
3725              {
3726              if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
3727              slot += cd->name_entry_size;
3728              }
3729    
3730            /* Found a previous named subpattern */
3731    
3732            if (i < cd->names_found)
3733              {
3734              recno = GET2(slot, 0);
3735              PUT2(code, 2+LINK_SIZE, recno);
3736              }
3737    
3738            /* Search the pattern for a forward reference */
3739    
3740            else if ((i = find_parens(ptr, cd->bracount, name, namelen,
3741                            (options & PCRE_EXTENDED) != 0)) > 0)
3742              {
3743              PUT2(code, 2+LINK_SIZE, i);
3744              }
3745    
3746            /* If terminator == 0 it means that the name followed directly after
3747            the opening parenthesis [e.g. (?(abc)...] and in this case there are
3748            some further alternatives to try. For the cases where terminator != 0
3749            [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
3750            now checked all the possibilities, so give an error. */
3751    
3752            else if (terminator != 0)
3753              {
3754              *errorcodeptr = ERR15;
3755              goto FAILED;
3756              }
3757    
3758            /* Check for (?(R) for recursion. Allow digits after R to specify a
3759            specific group number. */
3760    
3761            else if (*name == 'R')
3762              {
3763              recno = 0;
3764              for (i = 1; i < namelen; i++)
3765                {
3766                if ((digitab[name[i]] & ctype_digit) == 0)
3767                  {
3768                  *errorcodeptr = ERR15;
3769                  goto FAILED;
3770                  }
3771                recno = recno * 10 + name[i] - '0';
3772                }
3773              if (recno == 0) recno = RREF_ANY;
3774              code[1+LINK_SIZE] = OP_RREF;      /* Change test type */
3775              PUT2(code, 2+LINK_SIZE, recno);
3776              }
3777    
3778            /* Similarly, check for the (?(DEFINE) "condition", which is always
3779            false. */
3780    
3781            else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
3782              {
3783              code[1+LINK_SIZE] = OP_DEF;
3784              skipbytes = 1;
3785              }
3786    
3787            /* Check for the "name" actually being a subpattern number. */
3788    
3789            else if (recno > 0)
3790              {
3791              PUT2(code, 2+LINK_SIZE, recno);
3792              }
3793    
3794            /* Either an unidentified subpattern, or a reference to (?(0) */
3795    
3796            else
3797              {
3798              *errorcodeptr = (recno == 0)? ERR35: ERR15;
3799              goto FAILED;
3800            }            }
         /* For conditions that are assertions, we just fall through, having  
         set bravalue above. */  
3801          break;          break;
3802    
3803    
3804            /* ------------------------------------------------------------ */
3805          case '=':                 /* Positive lookahead */          case '=':                 /* Positive lookahead */
3806          bravalue = OP_ASSERT;          bravalue = OP_ASSERT;
3807          ptr++;          ptr++;
3808          break;          break;
3809    
3810    
3811            /* ------------------------------------------------------------ */
3812          case '!':                 /* Negative lookahead */          case '!':                 /* Negative lookahead */
3813          bravalue = OP_ASSERT_NOT;          bravalue = OP_ASSERT_NOT;
3814          ptr++;          ptr++;
3815          break;          break;
3816    
3817          case '<':                 /* Lookbehinds */  
3818          switch (*(++ptr))          /* ------------------------------------------------------------ */
3819            case '<':                 /* Lookbehind or named define */
3820            switch (ptr[1])
3821            {            {
3822            case '=':               /* Positive lookbehind */            case '=':               /* Positive lookbehind */
3823            bravalue = OP_ASSERTBACK;            bravalue = OP_ASSERTBACK;
3824            ptr++;            ptr += 2;
3825            break;            break;
3826    
3827            case '!':               /* Negative lookbehind */            case '!':               /* Negative lookbehind */
3828            bravalue = OP_ASSERTBACK_NOT;            bravalue = OP_ASSERTBACK_NOT;
3829            ptr++;            ptr += 2;
3830            break;            break;
3831    
3832              default:                /* Could be name define, else bad */
3833              if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
3834              ptr++;                  /* Correct offset for error */
3835              *errorcodeptr = ERR24;
3836              goto FAILED;
3837            }            }
3838          break;          break;
3839    
3840    
3841            /* ------------------------------------------------------------ */
3842          case '>':                 /* One-time brackets */          case '>':                 /* One-time brackets */
3843          bravalue = OP_ONCE;          bravalue = OP_ONCE;
3844          ptr++;          ptr++;
3845          break;          break;
3846    
3847    
3848            /* ------------------------------------------------------------ */
3849          case 'C':                 /* Callout - may be followed by digits; */          case 'C':                 /* Callout - may be followed by digits; */
3850          previous_callout = code;  /* Save for later completion */          previous_callout = code;  /* Save for later completion */
3851          after_manual_callout = 1; /* Skip one item before completing */          after_manual_callout = 1; /* Skip one item before completing */
3852          *code++ = OP_CALLOUT;     /* Already checked that the terminating */          *code++ = OP_CALLOUT;
3853            {                       /* closing parenthesis is present. */            {
3854            int n = 0;            int n = 0;
3855            while ((digitab[*(++ptr)] & ctype_digit) != 0)            while ((digitab[*(++ptr)] & ctype_digit) != 0)
3856              n = n * 10 + *ptr - '0';              n = n * 10 + *ptr - '0';
3857              if (*ptr != ')')
3858                {
3859                *errorcodeptr = ERR39;
3860                goto FAILED;
3861                }
3862            if (n > 255)            if (n > 255)
3863              {              {
3864              *errorcodeptr = ERR38;              *errorcodeptr = ERR38;
# Line 2896  for (;; ptr++) Line 3872  for (;; ptr++)
3872          previous = NULL;          previous = NULL;
3873          continue;          continue;
3874    
3875          case 'P':                 /* Named subpattern handling */  
3876          if (*(++ptr) == '<')      /* Definition */          /* ------------------------------------------------------------ */
3877            case 'P':                 /* Python-style named subpattern handling */
3878            if (*(++ptr) == '=' || *ptr == '>')  /* Reference or recursion */
3879              {
3880              is_recurse = *ptr == '>';
3881              terminator = ')';
3882              goto NAMED_REF_OR_RECURSE;
3883              }
3884            else if (*ptr != '<')    /* Test for Python-style definition */
3885            {            {
3886            int i, namelen;            *errorcodeptr = ERR41;
3887            uschar *slot = cd->name_table;            goto FAILED;
3888            const uschar *name;     /* Don't amalgamate; some compilers */            }
3889            name = ++ptr;           /* grumble at autoincrement in declaration */          /* Fall through to handle (?P< as (?< is handled */
3890    
           while (*ptr++ != '>');  
           namelen = ptr - name - 1;  
3891    
3892            for (i = 0; i < cd->names_found; i++)          /* ------------------------------------------------------------ */
3893            DEFINE_NAME:    /* Come here from (?< handling */
3894            case '\'':
3895              {
3896              terminator = (*ptr == '<')? '>' : '\'';
3897              name = ++ptr;
3898    
3899              while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
3900              namelen = ptr - name;
3901    
3902              /* In the pre-compile phase, just do a syntax check. */
3903    
3904              if (lengthptr != NULL)
3905              {              {
3906              int crc = memcmp(name, slot+2, namelen);              if (*ptr != terminator)
3907              if (crc == 0)                {
3908                  *errorcodeptr = ERR42;
3909                  goto FAILED;
3910                  }
3911                if (cd->names_found >= MAX_NAME_COUNT)
3912                  {
3913                  *errorcodeptr = ERR49;
3914                  goto FAILED;
3915                  }
3916                if (namelen + 3 > cd->name_entry_size)
3917                {                {
3918                if (slot[2+namelen] == 0)                cd->name_entry_size = namelen + 3;
3919                  if (namelen > MAX_NAME_SIZE)
3920                  {                  {
3921                  *errorcodeptr = ERR43;                  *errorcodeptr = ERR48;
3922                  goto FAILED;                  goto FAILED;
3923                  }                  }
               crc = -1;             /* Current name is substring */  
3924                }                }
3925              if (crc < 0)              }
3926    
3927              /* In the real compile, create the entry in the table */
3928    
3929              else
3930                {
3931                slot = cd->name_table;
3932                for (i = 0; i < cd->names_found; i++)
3933                {                {
3934                memmove(slot + cd->name_entry_size, slot,                int crc = memcmp(name, slot+2, namelen);
3935                  (cd->names_found - i) * cd->name_entry_size);                if (crc == 0)
3936                break;                  {
3937                    if (slot[2+namelen] == 0)
3938                      {
3939                      if ((options & PCRE_DUPNAMES) == 0)
3940                        {
3941                        *errorcodeptr = ERR43;
3942                        goto FAILED;
3943                        }
3944                      }
3945                    else crc = -1;      /* Current name is substring */
3946                    }
3947                  if (crc < 0)
3948                    {
3949                    memmove(slot + cd->name_entry_size, slot,
3950                      (cd->names_found - i) * cd->name_entry_size);
3951                    break;
3952                    }
3953                  slot += cd->name_entry_size;
3954                }                }
             slot += cd->name_entry_size;  
             }  
3955    
3956            PUT2(slot, 0, *brackets + 1);              PUT2(slot, 0, cd->bracount + 1);
3957            memcpy(slot + 2, name, namelen);              memcpy(slot + 2, name, namelen);
3958            slot[2+namelen] = 0;              slot[2+namelen] = 0;
3959            cd->names_found++;              }
           goto NUMBERED_GROUP;  
3960            }            }
3961    
3962          if (*ptr == '=' || *ptr == '>')  /* Reference or recursion */          /* In both cases, count the number of names we've encountered. */
3963    
3964            ptr++;                    /* Move past > or ' */
3965            cd->names_found++;
3966            goto NUMBERED_GROUP;
3967    
3968    
3969            /* ------------------------------------------------------------ */
3970            case '&':                 /* Perl recursion/subroutine syntax */
3971            terminator = ')';
3972            is_recurse = TRUE;
3973            /* Fall through */
3974    
3975            /* We come here from the Python syntax above that handles both
3976            references (?P=name) and recursion (?P>name), as well as falling
3977            through from the Perl recursion syntax (?&name). */
3978    
3979            NAMED_REF_OR_RECURSE:
3980            name = ++ptr;
3981            while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
3982            namelen = ptr - name;
3983    
3984            /* In the pre-compile phase, do a syntax check and set a dummy
3985            reference number. */
3986    
3987            if (lengthptr != NULL)
3988            {            {
3989            int i, namelen;            if (*ptr != terminator)
3990            int type = *ptr++;              {
3991            const uschar *name = ptr;              *errorcodeptr = ERR42;
3992            uschar *slot = cd->name_table;              goto FAILED;
3993                }
3994              if (namelen > MAX_NAME_SIZE)
3995                {
3996                *errorcodeptr = ERR48;
3997                goto FAILED;
3998                }
3999              recno = 0;
4000              }
4001    
4002            while (*ptr != ')') ptr++;          /* In the real compile, seek the name in the table */
           namelen = ptr - name;  
4003    
4004            else
4005              {
4006              slot = cd->name_table;
4007            for (i = 0; i < cd->names_found; i++)            for (i = 0; i < cd->names_found; i++)
4008              {              {
4009              if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;              if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4010              slot += cd->name_entry_size;              slot += cd->name_entry_size;
4011              }              }
4012            if (i >= cd->names_found)  
4013              if (i < cd->names_found)         /* Back reference */
4014                {
4015                recno = GET2(slot, 0);
4016                }
4017              else if ((recno =                /* Forward back reference */
4018                        find_parens(ptr, cd->bracount, name, namelen,
4019                          (options & PCRE_EXTENDED) != 0)) <= 0)
4020              {              {
4021              *errorcodeptr = ERR15;              *errorcodeptr = ERR15;
4022              goto FAILED;              goto FAILED;
4023              }              }
4024              }
4025    
4026            recno = GET2(slot, 0);          /* In both phases, we can now go to the code that handles numerical
4027            recursion or backreferences. */
           if (type == '>') goto HANDLE_RECURSION;  /* A few lines below */  
   
           /* Back reference */  
4028    
4029            previous = code;          if (is_recurse) goto HANDLE_RECURSION;
4030            *code++ = OP_REF;            else goto HANDLE_REFERENCE;
           PUT2INC(code, 0, recno);  
           cd->backref_map |= (recno < 32)? (1 << recno) : 1;  
           if (recno > cd->top_backref) cd->top_backref = recno;  
           continue;  
           }  
4031    
         /* Should never happen */  
         break;  
4032    
4033          case 'R':                 /* Pattern recursion */          /* ------------------------------------------------------------ */
4034            case 'R':                 /* Recursion */
4035          ptr++;                    /* Same as (?0)      */          ptr++;                    /* Same as (?0)      */
4036          /* Fall through */          /* Fall through */
4037    
         /* Recursion or "subroutine" call */  
4038    
4039          case '0': case '1': case '2': case '3': case '4':          /* ------------------------------------------------------------ */
4040          case '5': case '6': case '7': case '8': case '9':          case '-': case '+':
4041            case '0': case '1': case '2': case '3': case '4':   /* Recursion or */
4042            case '5': case '6': case '7': case '8': case '9':   /* subroutine */
4043            {            {
4044            const uschar *called;            const uschar *called;
4045    
4046              if ((refsign = *ptr) == '+') ptr++;
4047              else if (refsign == '-')
4048                {
4049                if ((digitab[ptr[1]] & ctype_digit) == 0)
4050                  goto OTHER_CHAR_AFTER_QUERY;
4051                ptr++;
4052                }
4053    
4054            recno = 0;            recno = 0;
4055            while((digitab[*ptr] & ctype_digit) != 0)            while((digitab[*ptr] & ctype_digit) != 0)
4056              recno = recno * 10 + *ptr++ - '0';              recno = recno * 10 + *ptr++ - '0';
4057    
4058              if (*ptr != ')')
4059                {
4060                *errorcodeptr = ERR29;
4061                goto FAILED;
4062                }
4063    
4064              if (refsign == '-')
4065                {
4066                if (recno == 0)
4067                  {
4068                  *errorcodeptr = ERR58;
4069                  goto FAILED;
4070                  }
4071                recno = cd->bracount - recno + 1;
4072                if (recno <= 0)
4073                  {
4074                  *errorcodeptr = ERR15;
4075                  goto FAILED;
4076                  }
4077                }
4078              else if (refsign == '+')
4079                {
4080                if (recno == 0)
4081                  {
4082                  *errorcodeptr = ERR58;
4083                  goto FAILED;
4084                  }
4085                recno += cd->bracount;
4086                }
4087    
4088            /* Come here from code above that handles a named recursion */            /* Come here from code above that handles a named recursion */
4089    
4090            HANDLE_RECURSION:            HANDLE_RECURSION:
4091    
4092            previous = code;            previous = code;
4093              called = cd->start_code;
4094    
4095            /* Find the bracket that is being referenced. Temporarily end the            /* When we are actually compiling, find the bracket that is being
4096            regex in case it doesn't exist. */            referenced. Temporarily end the regex in case it doesn't exist before
4097              this point. If we end up with a forward reference, first check that
4098            *code = OP_END;            the bracket does occur later so we can give the error (and position)
4099            called = (recno == 0)?            now. Then remember this forward reference in the workspace so it can
4100              cd->start_code : find_bracket(cd->start_code, utf8, recno);            be filled in at the end. */
4101    
4102            if (called == NULL)            if (lengthptr == NULL)
4103              {              {
4104              *errorcodeptr = ERR15;              *code = OP_END;
4105              goto FAILED;              if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
             }  
4106    
4107            /* If the subpattern is still open, this is a recursive call. We              /* Forward reference */
           check to see if this is a left recursion that could loop for ever,  
           and diagnose that case. */  
4108    
4109            if (GET(called, 1) == 0 && could_be_empty(called, code, bcptr, utf8))              if (called == NULL)
4110              {                {
4111              *errorcodeptr = ERR40;                if (find_parens(ptr, cd->bracount, NULL, recno,
4112              goto FAILED;                     (options & PCRE_EXTENDED) != 0) < 0)
4113                    {
4114                    *errorcodeptr = ERR15;
4115                    goto FAILED;
4116                    }
4117                  called = cd->start_code + recno;
4118                  PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
4119                  }
4120    
4121                /* If not a forward reference, and the subpattern is still open,
4122                this is a recursive call. We check to see if this is a left
4123                recursion that could loop for ever, and diagnose that case. */
4124    
4125                else if (GET(called, 1) == 0 &&
4126                         could_be_empty(called, code, bcptr, utf8))
4127                  {
4128                  *errorcodeptr = ERR40;
4129                  goto FAILED;
4130                  }
4131              }              }
4132    
4133            /* Insert the recursion/subroutine item */            /* Insert the recursion/subroutine item, automatically wrapped inside
4134              "once" brackets. Set up a "previous group" length so that a
4135              subsequent quantifier will work. */
4136    
4137              *code = OP_ONCE;
4138              PUT(code, 1, 2 + 2*LINK_SIZE);
4139              code += 1 + LINK_SIZE;
4140    
4141            *code = OP_RECURSE;            *code = OP_RECURSE;
4142            PUT(code, 1, called - cd->start_code);            PUT(code, 1, called - cd->start_code);
4143            code += 1 + LINK_SIZE;            code += 1 + LINK_SIZE;
4144    
4145              *code = OP_KET;
4146              PUT(code, 1, 2 + 2*LINK_SIZE);
4147              code += 1 + LINK_SIZE;
4148    
4149              length_prevgroup = 3 + 3*LINK_SIZE;
4150            }            }
4151    
4152            /* Can't determine a first byte now */
4153    
4154            if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4155          continue;          continue;
4156    
         /* Character after (? not specially recognized */  
4157    
4158          default:                  /* Option setting */          /* ------------------------------------------------------------ */
4159            default:              /* Other characters: check option setting */
4160            OTHER_CHAR_AFTER_QUERY:
4161          set = unset = 0;          set = unset = 0;
4162          optset = &set;          optset = &set;
4163    
# Line 3036  for (;; ptr++) Line 4167  for (;; ptr++)
4167              {              {
4168              case '-': optset = &unset; break;              case '-': optset = &unset; break;
4169    
4170                case 'J':    /* Record that it changed in the external options */
4171                *optset |= PCRE_DUPNAMES;
4172                cd->external_options |= PCRE_JCHANGED;
4173                break;
4174    
4175              case 'i': *optset |= PCRE_CASELESS; break;              case 'i': *optset |= PCRE_CASELESS; break;
4176              case 'm': *optset |= PCRE_MULTILINE; break;              case 'm': *optset |= PCRE_MULTILINE; break;
4177              case 's': *optset |= PCRE_DOTALL; break;              case 's': *optset |= PCRE_DOTALL; break;
4178              case 'x': *optset |= PCRE_EXTENDED; break;              case 'x': *optset |= PCRE_EXTENDED; break;
4179              case 'U': *optset |= PCRE_UNGREEDY; break;              case 'U': *optset |= PCRE_UNGREEDY; break;
4180              case 'X': *optset |= PCRE_EXTRA; break;              case 'X': *optset |= PCRE_EXTRA; break;
4181    
4182                default:  *errorcodeptr = ERR12;
4183                          ptr--;    /* Correct the offset */
4184                          goto FAILED;
4185              }              }
4186            }            }
4187    
# Line 3050  for (;; ptr++) Line 4190  for (;; ptr++)
4190          newoptions = (options | set) & (~unset);          newoptions = (options | set) & (~unset);
4191    
4192          /* If the options ended with ')' this is not the start of a nested          /* If the options ended with ')' this is not the start of a nested
4193          group with option changes, so the options change at this level. Compile          group with option changes, so the options change at this level. If this
4194          code to change the ims options if this setting actually changes any of          item is right at the start of the pattern, the options can be
4195          them. We also pass the new setting back so that it can be put at the          abstracted and made external in the pre-compile phase, and ignored in
4196          start of any following branches, and when this group ends (if we are in          the compile phase. This can be helpful when matching -- for instance in
4197          a group), a resetting item can be compiled.          caseless checking of required bytes.
4198    
4199          Note that if this item is right at the start of the pattern, the          If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
4200          options will have been abstracted and made global, so there will be no          definitely *not* at the start of the pattern because something has been
4201          change to compile. */          compiled. In the pre-compile phase, however, the code pointer can have
4202            that value after the start, because it gets reset as code is discarded
4203            during the pre-compile. However, this can happen only at top level - if
4204            we are within parentheses, the starting BRA will still be present. At
4205            any parenthesis level, the length value can be used to test if anything
4206            has been compiled at that level. Thus, a test for both these conditions
4207            is necessary to ensure we correctly detect the start of the pattern in
4208            both phases.
4209    
4210            If we are not at the pattern start, compile code to change the ims
4211            options if this setting actually changes any of them. We also pass the
4212            new setting back so that it can be put at the start of any following
4213            branches, and when this group ends (if we are in a group), a resetting
4214            item can be compiled. */
4215    
4216          if (*ptr == ')')          if (*ptr == ')')
4217            {            {
4218            if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))            if (code == cd->start_code + 1 + LINK_SIZE &&
4219                   (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
4220              {              {
4221              *code++ = OP_OPT;              cd->external_options = newoptions;
4222              *code++ = newoptions & PCRE_IMS;              options = newoptions;
4223              }              }
4224             else
4225                {
4226                if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
4227                  {
4228                  *code++ = OP_OPT;
4229                  *code++ = newoptions & PCRE_IMS;
4230                  }
4231    
4232            /* Change options at this level, and pass them back for use              /* Change options at this level, and pass them back for use
4233            in subsequent branches. Reset the greedy defaults and the case              in subsequent branches. Reset the greedy defaults and the case
4234            value for firstbyte and reqbyte. */              value for firstbyte and reqbyte. */
4235    
4236            *optionsptr = options = newoptions;              *optionsptr = options = newoptions;
4237            greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);              greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
4238            greedy_non_default = greedy_default ^ 1;              greedy_non_default = greedy_default ^ 1;
4239            req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;              req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
4240                }
4241    
4242            previous = NULL;       /* This item can't be repeated */            previous = NULL;       /* This item can't be repeated */
4243            continue;              /* It is complete */            continue;              /* It is complete */
# Line 3088  for (;; ptr++) Line 4250  for (;; ptr++)
4250    
4251          bravalue = OP_BRA;          bravalue = OP_BRA;
4252          ptr++;          ptr++;
4253          }          }     /* End of switch for character following (? */
4254        }        }       /* End of (? handling */
4255    
4256      /* If PCRE_NO_AUTO_CAPTURE is set, all unadorned brackets become      /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
4257      non-capturing and behave like (?:...) brackets */      all unadorned brackets become non-capturing and behave like (?:...)
4258        brackets. */
4259    
4260      else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)      else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
4261        {        {
4262        bravalue = OP_BRA;        bravalue = OP_BRA;
4263        }        }
4264    
4265      /* Else we have a referencing group; adjust the opcode. If the bracket      /* Else we have a capturing group. */
     number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and  
     arrange for the true number to follow later, in an OP_BRANUMBER item. */  
4266    
4267      else      else
4268        {        {
4269        NUMBERED_GROUP:        NUMBERED_GROUP:
4270        if (++(*brackets) > EXTRACT_BASIC_MAX)        cd->bracount += 1;
4271          {        PUT2(code, 1+LINK_SIZE, cd->bracount);
4272          bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;        skipbytes = 2;
         code[1+LINK_SIZE] = OP_BRANUMBER;  
         PUT2(code, 2+LINK_SIZE, *brackets);  
         skipbytes = 3;  
         }  
       else bravalue = OP_BRA + *brackets;  
4273        }        }
4274    
4275      /* Process nested bracketed re. Assertions may not be repeated, but other      /* Process nested bracketed regex. Assertions may not be repeated, but
4276      kinds can be. We copy code into a non-register variable in order to be able      other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
4277      to pass its address because some compilers complain otherwise. Pass in a      non-register variable in order to be able to pass its address because some
4278      new setting for the ims options if they have changed. */      compilers complain otherwise. Pass in a new setting for the ims options if
4279        they have changed. */
4280    
4281      previous = (bravalue >= OP_ONCE)? code : NULL;      previous = (bravalue >= OP_ONCE)? code : NULL;
4282      *code = bravalue;      *code = bravalue;
4283      tempcode = code;      tempcode = code;
4284      tempreqvary = cd->req_varyopt;     /* Save value before bracket */      tempreqvary = cd->req_varyopt;     /* Save value before bracket */
4285        length_prevgroup = 0;              /* Initialize for pre-compile phase */
4286    
4287      if (!compile_regex(      if (!compile_regex(
4288           newoptions,                   /* The complete new option state */           newoptions,                   /* The complete new option state */
4289           options & PCRE_IMS,           /* The previous ims option state */           options & PCRE_IMS,           /* The previous ims option state */
          brackets,                     /* Extracting bracket count */  
4290           &tempcode,                    /* Where to put code (updated) */           &tempcode,                    /* Where to put code (updated) */
4291           &ptr,                         /* Input pointer (updated) */           &ptr,                         /* Input pointer (updated) */
4292           errorcodeptr,                 /* Where to put an error message */           errorcodeptr,                 /* Where to put an error message */
4293           (bravalue == OP_ASSERTBACK ||           (bravalue == OP_ASSERTBACK ||
4294            bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */            bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
4295           skipbytes,                    /* Skip over OP_COND/OP_BRANUMBER */           skipbytes,                    /* Skip over bracket number */
4296           &subfirstbyte,                /* For possible first char */           &subfirstbyte,                /* For possible first char */
4297           &subreqbyte,                  /* For possible last char */           &subreqbyte,                  /* For possible last char */
4298           bcptr,                        /* Current branch chain */           bcptr,                        /* Current branch chain */
4299           cd))                          /* Tables block */           cd,                           /* Tables block */
4300             (lengthptr == NULL)? NULL :   /* Actual compile phase */
4301               &length_prevgroup           /* Pre-compile phase */
4302             ))
4303        goto FAILED;        goto FAILED;
4304    
4305      /* At the end of compiling, code is still pointing to the start of the      /* At the end of compiling, code is still pointing to the start of the
# Line 3148  for (;; ptr++) Line 4308  for (;; ptr++)
4308      is on the bracket. */      is on the bracket. */
4309    
4310      /* If this is a conditional bracket, check that there are no more than      /* If this is a conditional bracket, check that there are no more than
4311      two branches in the group. */      two branches in the group, or just one if it's a DEFINE group. */
4312    
4313      else if (bravalue == OP_COND)      if (bravalue == OP_COND)
4314        {        {
4315        uschar *tc = code;        uschar *tc = code;
4316        condcount = 0;        int condcount = 0;
4317    
4318        do {        do {
4319           condcount++;           condcount++;
# Line 3161  for (;; ptr++) Line 4321  for (;; ptr++)
4321           }           }
4322        while (*tc != OP_KET);        while (*tc != OP_KET);
4323    
4324        if (condcount > 2)        /* A DEFINE group is never obeyed inline (the "condition" is always
4325          false). It must have only one branch. */
4326    
4327          if (code[LINK_SIZE+1] == OP_DEF)
4328          {          {
4329          *errorcodeptr = ERR27;          if (condcount > 1)
4330          goto FAILED;            {
4331              *errorcodeptr = ERR54;
4332              goto FAILED;
4333              }
4334            bravalue = OP_DEF;   /* Just a flag to suppress char handling below */
4335            }
4336    
4337          /* A "normal" conditional group. If there is just one branch, we must not
4338          make use of its firstbyte or reqbyte, because this is equivalent to an
4339          empty second branch. */
4340    
4341          else
4342            {
4343            if (condcount > 2)
4344              {
4345              *errorcodeptr = ERR27;
4346              goto FAILED;
4347              }
4348            if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
4349          }          }
4350          }
4351    
4352        /* If there is just one branch, we must not make use of its firstbyte or      /* Error if hit end of pattern */
4353        reqbyte, because this is equivalent to an empty second branch. */  
4354        if (*ptr != ')')
4355          {
4356          *errorcodeptr = ERR14;
4357          goto FAILED;
4358          }
4359    
4360        if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;      /* In the pre-compile phase, update the length by the length of the nested
4361        group, less the brackets at either end. Then reduce the compiled code to
4362        just the brackets so that it doesn't use much memory if it is duplicated by
4363        a quantifier. */
4364    
4365        if (lengthptr != NULL)
4366          {
4367          *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
4368          code++;
4369          PUTINC(code, 0, 1 + LINK_SIZE);
4370          *code++ = OP_KET;
4371          PUTINC(code, 0, 1 + LINK_SIZE);
4372        }        }
4373    
4374      /* Handle updating of the required and first characters. Update for normal      /* Otherwise update the main code pointer to the end of the group. */
4375      brackets of all kinds, and conditions with two branches (see code above).  
4376      If the bracket is followed by a quantifier with zero repeat, we have to      else code = tempcode;
4377      back off. Hence the definition of zeroreqbyte and zerofirstbyte outside the  
4378      main loop so that they can be accessed for the back off. */      /* For a DEFINE group, required and first character settings are not
4379        relevant. */
4380    
4381        if (bravalue == OP_DEF) break;
4382    
4383        /* Handle updating of the required and first characters for other types of
4384        group. Update for normal brackets of all kinds, and conditions with two
4385        branches (see code above). If the bracket is followed by a quantifier with
4386        zero repeat, we have to back off. Hence the definition of zeroreqbyte and
4387        zerofirstbyte outside the main loop so that they can be accessed for the
4388        back off. */
4389    
4390      zeroreqbyte = reqbyte;      zeroreqbyte = reqbyte;
4391      zerofirstbyte = firstbyte;      zerofirstbyte = firstbyte;
4392      groupsetfirstbyte = FALSE;      groupsetfirstbyte = FALSE;
4393    
4394      if (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_COND)      if (bravalue >= OP_ONCE)
4395        {        {
4396        /* If we have not yet set a firstbyte in this branch, take it from the        /* If we have not yet set a firstbyte in this branch, take it from the
4397        subpattern, remembering that it was set here so that a repeat of more        subpattern, remembering that it was set here so that a repeat of more
# Line 3224  for (;; ptr++) Line 4432  for (;; ptr++)
4432      firstbyte, looking for an asserted first char. */      firstbyte, looking for an asserted first char. */
4433    
4434      else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;      else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
4435        break;     /* End of processing '(' */
4436    
     /* Now update the main code pointer to the end of the group. */  
   
     code = tempcode;  
   
     /* Error if hit end of pattern */  
   
     if (*ptr != ')')  
       {  
       *errorcodeptr = ERR14;  
       goto FAILED;  
       }  
     break;  
   
     /* Check \ for being a real metacharacter; if not, fall through and handle  
     it as a data character at the start of a string. Escape items are checked  
     for validity in the pre-compiling pass. */  
   
     case '\\':  
     tempptr = ptr;  
     c = check_escape(&ptr, errorcodeptr, *brackets, options, FALSE);  
4437    
4438      /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values      /* ===================================================================*/
4439        /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
4440      are arranged to be the negation of the corresponding OP_values. For the      are arranged to be the negation of the corresponding OP_values. For the
4441      back references, the values are ESC_REF plus the reference number. Only      back references, the values are ESC_REF plus the reference number. Only
4442      back references and those types that consume a character may be repeated.      back references and those types that consume a character may be repeated.
4443      We can test for values between ESC_b and ESC_Z for the latter; this may      We can test for values between ESC_b and ESC_Z for the latter; this may
4444      have to change if any new ones are ever created. */      have to change if any new ones are ever created. */
4445    
4446        case '\\':
4447        tempptr = ptr;
4448        c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
4449        if (*errorcodeptr != 0) goto FAILED;
4450    
4451      if (c < 0)      if (c < 0)
4452        {        {
4453        if (-c == ESC_Q)            /* Handle start of quoted string */        if (-c == ESC_Q)            /* Handle start of quoted string */
# Line 3262  for (;; ptr++) Line 4457  for (;; ptr++)
4457          continue;          continue;
4458          }          }
4459    
4460          if (-c == ESC_E) continue;  /* Perl ignores an orphan \E */
4461    
4462        /* For metasequences that actually match a character, we disable the        /* For metasequences that actually match a character, we disable the
4463        setting of a first character if it hasn't already been set. */        setting of a first character if it hasn't already been set. */
4464    
# Line 3273  for (;; ptr++) Line 4470  for (;; ptr++)
4470        zerofirstbyte = firstbyte;        zerofirstbyte = firstbyte;
4471        zeroreqbyte = reqbyte;        zeroreqbyte = reqbyte;
4472    
4473        /* Back references are handled specially */        /* \k<name> or \k'name' is a back reference by name (Perl syntax) */
4474    
4475          if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\''))
4476            {
4477            is_recurse = FALSE;
4478            terminator = (*(++ptr) == '<')? '>' : '\'';
4479            goto NAMED_REF_OR_RECURSE;
4480            }
4481    
4482          /* Back references are handled specially; must disable firstbyte if
4483          not set to cope with cases like (?=(\w+))\1: which would otherwise set
4484          ':' later. */
4485    
4486        if (-c >= ESC_REF)        if (-c >= ESC_REF)
4487          {          {
4488          int number = -c - ESC_REF;          recno = -c - ESC_REF;
4489    
4490            HANDLE_REFERENCE:    /* Come here from named backref handling */
4491            if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4492          previous = code;          previous = code;
4493          *code++ = OP_REF;          *code++ = OP_REF;
4494          PUT2INC(code, 0, number);          PUT2INC(code, 0, recno);
4495            cd->backref_map |= (recno < 32)? (1 << recno) : 1;
4496            if (recno > cd->top_backref) cd->top_backref = recno;
4497          }          }
4498    
4499        /* So are Unicode property matches, if supported. We know that get_ucp        /* So are Unicode property matches, if supported. */
       won't fail because it was tested in the pre-pass. */  
4500    
4501  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
4502        else if (-c == ESC_P || -c == ESC_p)        else if (-c == ESC_P || -c == ESC_p)
4503          {          {
4504          BOOL negated;          BOOL negated;
4505          int value = get_ucp(&ptr, &negated, errorcodeptr);          int pdata;
4506            int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
4507            if (ptype < 0) goto FAILED;
4508          previous = code;          previous = code;
4509          *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;          *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
4510          *code++ = value;          *code++ = ptype;
4511            *code++ = pdata;
4512            }
4513    #else
4514    
4515          /* If Unicode properties are not supported, \X, \P, and \p are not
4516          allowed. */
4517    
4518          else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
4519            {
4520            *errorcodeptr = ERR45;
4521            goto FAILED;
4522          }          }
4523  #endif  #endif
4524    
4525        /* For the rest, we can obtain the OP value by negating the escape        /* For the rest (including \X when Unicode properties are supported), we
4526        value */        can obtain the OP value by negating the escape value. */
4527    
4528        else        else
4529          {          {
# Line 3322  for (;; ptr++) Line 4547  for (;; ptr++)
4547       mcbuffer[0] = c;       mcbuffer[0] = c;
4548       mclength = 1;       mclength = 1;
4549       }       }
   
4550      goto ONE_CHAR;      goto ONE_CHAR;
4551    
4552    
4553        /* ===================================================================*/
4554      /* Handle a literal character. It is guaranteed not to be whitespace or #      /* Handle a literal character. It is guaranteed not to be whitespace or #
4555      when the extended flag is set. If we are in UTF-8 mode, it may be a      when the extended flag is set. If we are in UTF-8 mode, it may be a
4556      multi-byte literal character. */      multi-byte literal character. */
# Line 3335  for (;; ptr++) Line 4561  for (;; ptr++)
4561      mcbuffer[0] = c;      mcbuffer[0] = c;
4562    
4563  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
4564      if (utf8 && (c & 0xc0) == 0xc0)      if (utf8 && c >= 0xc0)
4565        {        {
4566        while ((ptr[1] & 0xc0) == 0x80)        while ((ptr[1] & 0xc0) == 0x80)
4567          mcbuffer[mclength++] = *(++ptr);          mcbuffer[mclength++] = *(++ptr);
# Line 3386  for (;; ptr++) Line 4612  for (;; ptr++)
4612      }      }
4613    }                   /* end of big loop */    }                   /* end of big loop */
4614    
4615    
4616  /* Control never reaches here by falling through, only by a goto for all the  /* Control never reaches here by falling through, only by a goto for all the
4617  error states. Pass back the position in the pattern so that it can be displayed  error states. Pass back the position in the pattern so that it can be displayed
4618  to the user for diagnosing the error. */  to the user for diagnosing the error. */
# Line 3402  return FALSE; Line 4629  return FALSE;
4629  *     Compile sequence of alternatives           *  *     Compile sequence of alternatives           *
4630  *************************************************/  *************************************************/
4631    
4632  /* On entry, ptr is pointing past the bracket character, but on return  /* On entry, ptr is pointing past the bracket character, but on return it
4633  it points to the closing bracket, or vertical bar, or end of string.  points to the closing bracket, or vertical bar, or end of string. The code
4634  The code variable is pointing at the byte into which the BRA operator has been  variable is pointing at the byte into which the BRA operator has been stored.
4635  stored. If the ims options are changed at the start (for a (?ims: group) or  If the ims options are changed at the start (for a (?ims: group) or during any
4636  during any branch, we need to insert an OP_OPT item at the start of every  branch, we need to insert an OP_OPT item at the start of every following branch
4637  following branch to ensure they get set correctly at run time, and also pass  to ensure they get set correctly at run time, and also pass the new options
4638  the new options into every subsequent branch compile.  into every subsequent branch compile.
4639    
4640    This function is used during the pre-compile phase when we are trying to find
4641    out the amount of memory needed, as well as during the real compile phase. The
4642    value of lengthptr distinguishes the two phases.
4643    
4644  Argument:  Argument:
4645    options        option bits, including any changes for this subpattern    options        option bits, including any changes for this subpattern
4646    oldims         previous settings of ims option bits    oldims         previous settings of ims option bits
   brackets       -> int containing the number of extracting brackets used  
4647    codeptr        -> the address of the current code pointer    codeptr        -> the address of the current code pointer
4648    ptrptr         -> the address of the current pattern pointer    ptrptr         -> the address of the current pattern pointer
4649    errorcodeptr   -> pointer to error code variable    errorcodeptr   -> pointer to error code variable
4650    lookbehind     TRUE if this is a lookbehind assertion    lookbehind     TRUE if this is a lookbehind assertion
4651    skipbytes      skip this many bytes at start (for OP_COND, OP_BRANUMBER)    skipbytes      skip this many bytes at start (for brackets and OP_COND)
4652    firstbyteptr   place to put the first required character, or a negative number    firstbyteptr   place to put the first required character, or a negative number
4653    reqbyteptr     place to put the last required character, or a negative number    reqbyteptr     place to put the last required character, or a negative number
4654    bcptr          pointer to the chain of currently open branches    bcptr          pointer to the chain of currently open branches
4655    cd             points to the data block with tables pointers etc.    cd             points to the data block with tables pointers etc.
4656      lengthptr      NULL during the real compile phase
4657                     points to length accumulator during pre-compile phase
4658    
4659  Returns:      TRUE on success  Returns:         TRUE on success
4660  */  */
4661    
4662  static BOOL  static BOOL
4663  compile_regex(int options, int oldims, int *brackets, uschar **codeptr,  compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
4664    const uschar **ptrptr, int *errorcodeptr, BOOL lookbehind, int skipbytes,    int *errorcodeptr, BOOL lookbehind, int skipbytes, int *firstbyteptr,
4665    int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd)    int *reqbyteptr, branch_chain *bcptr, compile_data *cd, int *lengthptr)
4666  {  {
4667  const uschar *ptr = *ptrptr;  const uschar *ptr = *ptrptr;
4668  uschar *code = *codeptr;  uschar *code = *codeptr;
# Line 3439  uschar *start_bracket = code; Line 4671  uschar *start_bracket = code;
4671  uschar *reverse_count = NULL;  uschar *reverse_count = NULL;
4672  int firstbyte, reqbyte;  int firstbyte, reqbyte;
4673  int branchfirstbyte, branchreqbyte;  int branchfirstbyte, branchreqbyte;
4674    int length;
4675  branch_chain bc;  branch_chain bc;
4676    
4677  bc.outer = bcptr;  bc.outer = bcptr;
# Line 3446  bc.current = code; Line 4679  bc.current = code;
4679    
4680  firstbyte = reqbyte = REQ_UNSET;  firstbyte = reqbyte = REQ_UNSET;
4681    
4682    /* Accumulate the length for use in the pre-compile phase. Start with the
4683    length of the BRA and KET and any extra bytes that are required at the
4684    beginning. We accumulate in a local variable to save frequent testing of
4685    lengthptr for NULL. We cannot do this by looking at the value of code at the
4686    start and end of each alternative, because compiled items are discarded during
4687    the pre-compile phase so that the work space is not exceeded. */
4688    
4689    length = 2 + 2*LINK_SIZE + skipbytes;
4690    
4691    /* WARNING: If the above line is changed for any reason, you must also change
4692    the code that abstracts option settings at the start of the pattern and makes
4693    them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
4694    pre-compile phase to find out whether anything has yet been compiled or not. */
4695    
4696  /* Offset is set zero to mark that this bracket is still open */  /* Offset is set zero to mark that this bracket is still open */
4697    
4698  PUT(code, 1, 0);  PUT(code, 1, 0);
# Line 3461  for (;;) Line 4708  for (;;)
4708      {      {
4709      *code++ = OP_OPT;      *code++ = OP_OPT;
4710      *code++ = options & PCRE_IMS;      *code++ = options & PCRE_IMS;
4711        length += 2;
4712      }      }
4713    
4714    /* Set up dummy OP_REVERSE if lookbehind assertion */    /* Set up dummy OP_REVERSE if lookbehind assertion */
# Line 3470  for (;;) Line 4718  for (;;)
4718      *code++ = OP_REVERSE;      *code++ = OP_REVERSE;
4719      reverse_count = code;      reverse_count = code;
4720      PUTINC(code, 0, 0);      PUTINC(code, 0, 0);
4721        length += 1 + LINK_SIZE;
4722      }      }
4723    
4724    /* Now compile the branch */    /* Now compile the branch; in the pre-compile phase its length gets added
4725      into the length. */
4726    
4727    if (!compile_branch(&options, brackets, &code, &ptr, errorcodeptr,    if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
4728          &branchfirstbyte, &branchreqbyte, &bc, cd))          &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
4729      {      {
4730      *ptrptr = ptr;      *ptrptr = ptr;
4731      return FALSE;      return FALSE;
4732      }      }
4733    
4734    /* If this is the first branch, the firstbyte and reqbyte values for the    /* In the real compile phase, there is some post-processing to be done. */
   branch become the values for the regex. */  
4735    
4736    if (*last_branch != OP_ALT)    if (lengthptr == NULL)
4737      {      {
4738      firstbyte = branchfirstbyte;      /* If this is the first branch, the firstbyte and reqbyte values for the
4739      reqbyte = branchreqbyte;      branch become the values for the regex. */
     }  
4740    
4741    /* If this is not the first branch, the first char and reqbyte have to      if (*last_branch != OP_ALT)
4742    match the values from all the previous branches, except that if the previous        {
4743    value for reqbyte didn't have REQ_VARY set, it can still match, and we set        firstbyte = branchfirstbyte;
4744    REQ_VARY for the regex. */        reqbyte = branchreqbyte;
4745          }
4746    
4747    else      /* If this is not the first branch, the first char and reqbyte have to
4748      {      match the values from all the previous branches, except that if the
4749      /* If we previously had a firstbyte, but it doesn't match the new branch,      previous value for reqbyte didn't have REQ_VARY set, it can still match,
4750      we have to abandon the firstbyte for the regex, but if there was previously      and we set REQ_VARY for the regex. */
     no reqbyte, it takes on the value of the old firstbyte. */  
4751