/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory | Revision Log | View Patch

revision 91 by nigel, Sat Feb 24 21:41:34 2007 UTC (left column) | revision 333 by ph10, Thu Apr 10 19:55:57 2008 UTC (right column)
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2006 University of Cambridge             Copyright (c) 1997-2008 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 42  POSSIBILITY OF SUCH DAMAGE. Line 42  POSSIBILITY OF SUCH DAMAGE.
42  supporting internal functions that are not used by other modules. */  supporting internal functions that are not used by other modules. */
43    
44    
45  #define NLBLOCK cd            /* The block containing newline information */  #ifdef HAVE_CONFIG_H
46    #include "config.h"
47    #endif
48    
49    #define NLBLOCK cd             /* Block containing newline information */
50    #define PSSTART start_pattern  /* Field containing processed string start */
51    #define PSEND   end_pattern    /* Field containing processed string end */
52    
53  #include "pcre_internal.h"  #include "pcre_internal.h"
54    
55    
# Line 54  used by pcretest. DEBUG is not defined w Line 61  used by pcretest. DEBUG is not defined w
61  #endif  #endif
62    
63    
64    /* Macro for setting individual bits in class bitmaps. */
65    
66    #define SETBIT(a,b) a[b/8] |= (1 << (b%8))
67    
68    /* Maximum length value to check against when making sure that the integer that
69    holds the compiled pattern length does not overflow. We make it a bit less than
70    INT_MAX to allow for adding in group terminating bytes, so that we don't have
71    to check them every time. */
72    
73    #define OFLOW_MAX (INT_MAX - 20)
74    
75    
76  /*************************************************  /*************************************************
77  *      Code parameters and static tables         *  *      Code parameters and static tables         *
78  *************************************************/  *************************************************/
79    
80  /* Maximum number of items on the nested bracket stacks at compile time. This  /* This value specifies the size of stack workspace that is used during the
81  applies to the nesting of all kinds of parentheses. It does not limit  first pre-compile phase that determines how much memory is required. The regex
82  un-nested, non-capturing parentheses. This number can be made bigger if  is partly compiled into this space, but the compiled parts are discarded as
83  necessary - it is used to dimension one int and one unsigned char vector at  soon as they can be, so that hopefully there will never be an overrun. The code
84  compile time. */  does, however, check for an overrun. The largest amount I've seen used is 218,
85    so this number is very generous.
86    
87    The same workspace is used during the second, actual compile phase for
88    remembering forward references to groups so that they can be filled in at the
89    end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
90    is 4 there is plenty of room. */
91    
92  #define BRASTACK_SIZE 200  #define COMPILE_WORK_SIZE (4096)
93    
94    
95  /* Table for handling escaped characters in the range '0'-'z'. Positive returns  /* Table for handling escaped characters in the range '0'-'z'. Positive returns
# Line 73  are simple data values; negative values Line 97  are simple data values; negative values
97  on. Zero means further processing is needed (for things like \x), or the escape  on. Zero means further processing is needed (for things like \x), or the escape
98  is invalid. */  is invalid. */
99    
100  #if !EBCDIC   /* This is the "normal" table for ASCII systems */  #ifndef EBCDIC  /* This is the "normal" table for ASCII systems */
101  static const short int escapes[] = {  static const short int escapes[] = {
102       0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */       0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */
103       0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */       0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */
104     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */
105       0,      0,      0,      0,      0,      0,      0,      0,   /* H - O */  -ESC_H,      0,      0, -ESC_K,      0,      0,      0,      0,   /* H - O */
106  -ESC_P, -ESC_Q,      0, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */  -ESC_P, -ESC_Q, -ESC_R, -ESC_S,      0,      0, -ESC_V, -ESC_W,   /* P - W */
107  -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */  -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */
108     '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */     '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */
109       0,      0,      0,      0,      0,      0,  ESC_n,      0,   /* h - o */  -ESC_h,      0,      0, -ESC_k,      0,      0,  ESC_n,      0,   /* h - o */
110  -ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0,      0, -ESC_w,   /* p - w */  -ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0, -ESC_v, -ESC_w,   /* p - w */
111       0,      0, -ESC_z                                            /* x - z */       0,      0, -ESC_z                                            /* x - z */
112  };  };
113    
114  #else         /* This is the "abnormal" table for EBCDIC systems */  #else           /* This is the "abnormal" table for EBCDIC systems */
115  static const short int escapes[] = {  static const short int escapes[] = {
116  /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',  /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
117  /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,  /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
# Line 97  static const short int escapes[] = { Line 121  static const short int escapes[] = {
121  /*  70 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  70 */     0,     0,      0,       0,      0,     0,      0,      0,
122  /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',  /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
123  /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,  /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
124  /*  88 */     0,     0,      0,     '{',      0,     0,      0,      0,  /*  88 */-ESC_h,     0,      0,     '{',      0,     0,      0,      0,
125  /*  90 */     0,     0,      0,     'l',      0, ESC_n,      0, -ESC_p,  /*  90 */     0,     0, -ESC_k,     'l',      0, ESC_n,      0, -ESC_p,
126  /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,  /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
127  /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,     0, -ESC_w,      0,  /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,-ESC_v, -ESC_w,      0,
128  /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,  /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
129  /*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,
130  /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',  /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
131  /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,  /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
132  /*  C8 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,
133  /*  D0 */   '}',     0,      0,       0,      0,     0,      0, -ESC_P,  /*  D0 */   '}',     0, -ESC_K,       0,      0,     0,      0, -ESC_P,
134  /*  D8 */-ESC_Q,     0,      0,       0,      0,     0,      0,      0,  /*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
135  /*  E0 */  '\\',     0, -ESC_S,       0,      0,     0, -ESC_W, -ESC_X,  /*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,
136  /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,  /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
137  /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
138  /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0  /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0
# Line 116  static const short int escapes[] = { Line 140  static const short int escapes[] = {
140  #endif  #endif
141    
142    
143  /* Tables of names of POSIX character classes and their lengths. The list is  /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
144  terminated by a zero length entry. The first three must be alpha, lower, upper,  searched linearly. Put all the names into a single string, in order to reduce
145  as this is assumed for handling case independence. */  the number of relocations when a shared library is dynamically linked. */
146    
147  static const char *const posix_names[] = {  typedef struct verbitem {
148    "alpha", "lower", "upper",    int   len;
149    "alnum", "ascii", "blank", "cntrl", "digit", "graph",    int   op;
150    "print", "punct", "space", "word",  "xdigit" };  } verbitem;
151    
152    static const char verbnames[] =
153      "ACCEPT\0"
154      "COMMIT\0"
155      "F\0"
156      "FAIL\0"
157      "PRUNE\0"
158      "SKIP\0"
159      "THEN";
160    
161    static const verbitem verbs[] = {
162      { 6, OP_ACCEPT },
163      { 6, OP_COMMIT },
164      { 1, OP_FAIL },
165      { 4, OP_FAIL },
166      { 5, OP_PRUNE },
167      { 4, OP_SKIP  },
168      { 4, OP_THEN  }
169    };
170    
171    static const int verbcount = sizeof(verbs)/sizeof(verbitem);
172    
173    
174    /* Tables of names of POSIX character classes and their lengths. The names are
175    now all in a single string, to reduce the number of relocations when a shared
176    library is dynamically loaded. The list of lengths is terminated by a zero
177    length entry. The first three must be alpha, lower, upper, as this is assumed
178    for handling case independence. */
179    
180    static const char posix_names[] =
181      "alpha\0"  "lower\0"  "upper\0"  "alnum\0"  "ascii\0"  "blank\0"
182      "cntrl\0"  "digit\0"  "graph\0"  "print\0"  "punct\0"  "space\0"
183      "word\0"   "xdigit";
184    
185  static const uschar posix_name_lengths[] = {  static const uschar posix_name_lengths[] = {
186    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
# Line 156  static const int posix_class_maps[] = { Line 213  static const int posix_class_maps[] = {
213  };  };
214    
215    
216  /* The texts of compile-time error messages. These are "char *" because they  #define STRING(a)  # a
217  are passed to the outside world. */  #define XSTRING(s) STRING(s)
218    
219  static const char *error_texts[] = {  /* The texts of compile-time error messages. These are "char *" because they
220    "no error",  are passed to the outside world. Do not ever re-use any error number, because
221    "\\ at end of pattern",  they are documented. Always add a new error instead. Messages marked DEAD below
222    "\\c at end of pattern",  are no longer used. This used to be a table of strings, but in order to reduce
223    "unrecognized character follows \\",  the number of relocations needed when a shared library is loaded dynamically,
224    "numbers out of order in {} quantifier",  it is now one long string. We cannot use a table of offsets, because the
225    lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
226    simply count through to the one we want - this isn't a performance issue
227    because these strings are used only when there is a compilation error. */
228    
229    static const char error_texts[] =
230      "no error\0"
231      "\\ at end of pattern\0"
232      "\\c at end of pattern\0"
233      "unrecognized character follows \\\0"
234      "numbers out of order in {} quantifier\0"
235    /* 5 */    /* 5 */
236    "number too big in {} quantifier",    "number too big in {} quantifier\0"
237    "missing terminating ] for character class",    "missing terminating ] for character class\0"
238    "invalid escape sequence in character class",    "invalid escape sequence in character class\0"
239    "range out of order in character class",    "range out of order in character class\0"
240    "nothing to repeat",    "nothing to repeat\0"
241    /* 10 */    /* 10 */
242    "operand of unlimited repeat could match the empty string",    "operand of unlimited repeat could match the empty string\0"  /** DEAD **/
243    "internal error: unexpected repeat",    "internal error: unexpected repeat\0"
244    "unrecognized character after (?",    "unrecognized character after (? or (?-\0"
245    "POSIX named classes are supported only within a class",    "POSIX named classes are supported only within a class\0"
246    "missing )",    "missing )\0"
247    /* 15 */    /* 15 */
248    "reference to non-existent subpattern",    "reference to non-existent subpattern\0"
249    "erroffset passed as NULL",    "erroffset passed as NULL\0"
250    "unknown option bit(s) set",    "unknown option bit(s) set\0"
251    "missing ) after comment",    "missing ) after comment\0"
252    "parentheses nested too deeply",    "parentheses nested too deeply\0"  /** DEAD **/
253    /* 20 */    /* 20 */
254    "regular expression too large",    "regular expression is too large\0"
255    "failed to get memory",    "failed to get memory\0"
256    "unmatched parentheses",    "unmatched parentheses\0"
257    "internal error: code overflow",    "internal error: code overflow\0"
258    "unrecognized character after (?<",    "unrecognized character after (?<\0"
259    /* 25 */    /* 25 */
260    "lookbehind assertion is not fixed length",    "lookbehind assertion is not fixed length\0"
261    "malformed number or name after (?(",    "malformed number or name after (?(\0"
262    "conditional group contains more than two branches",    "conditional group contains more than two branches\0"
263    "assertion expected after (?(",    "assertion expected after (?(\0"
264    "(?R or (?digits must be followed by )",    "(?R or (?[+-]digits must be followed by )\0"
265    /* 30 */    /* 30 */
266    "unknown POSIX class name",    "unknown POSIX class name\0"
267    "POSIX collating elements are not supported",    "POSIX collating elements are not supported\0"
268    "this version of PCRE is not compiled with PCRE_UTF8 support",    "this version of PCRE is not compiled with PCRE_UTF8 support\0"
269    "spare error",    "spare error\0"  /** DEAD **/
270    "character value in \\x{...} sequence is too large",    "character value in \\x{...} sequence is too large\0"
271    /* 35 */    /* 35 */
272    "invalid condition (?(0)",    "invalid condition (?(0)\0"
273    "\\C not allowed in lookbehind assertion",    "\\C not allowed in lookbehind assertion\0"
274    "PCRE does not support \\L, \\l, \\N, \\U, or \\u",    "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"
275    "number after (?C is > 255",    "number after (?C is > 255\0"
276    "closing ) for (?C expected",    "closing ) for (?C expected\0"
277    /* 40 */    /* 40 */
278    "recursive call could loop indefinitely",    "recursive call could loop indefinitely\0"
279    "unrecognized character after (?P",    "unrecognized character after (?P\0"
280    "syntax error after (?P",    "syntax error in subpattern name (missing terminator)\0"
281    "two named subpatterns have the same name",    "two named subpatterns have the same name\0"
282    "invalid UTF-8 string",    "invalid UTF-8 string\0"
283    /* 45 */    /* 45 */
284    "support for \\P, \\p, and \\X has not been compiled",    "support for \\P, \\p, and \\X has not been compiled\0"
285    "malformed \\P or \\p sequence",    "malformed \\P or \\p sequence\0"
286    "unknown property name after \\P or \\p",    "unknown property name after \\P or \\p\0"
287    "subpattern name is too long (maximum 32 characters)",    "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
288    "too many named subpatterns (maximum 10,000)",    "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
289    /* 50 */    /* 50 */
290    "repeated subpattern is too long",    "repeated subpattern is too long\0"    /** DEAD **/
291    "octal value is greater than \\377 (not in UTF-8 mode)"    "octal value is greater than \\377 (not in UTF-8 mode)\0"
292  };    "internal error: overran compiling workspace\0"
293      "internal error: previously-checked referenced subpattern not found\0"
294      "DEFINE group contains more than one branch\0"
295      /* 55 */
296      "repeating a DEFINE group is not allowed\0"
297      "inconsistent NEWLINE options\0"
298      "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
299      "a numbered reference must not be zero\0"
300      "(*VERB) with an argument is not supported\0"
301      /* 60 */
302      "(*VERB) not recognized\0"
303      "number is too big\0"
304      "subpattern name expected\0"
305      "digit expected after (?+";
306    
307    
308  /* Table to identify digits and hex digits. This is used when compiling  /* Table to identify digits and hex digits. This is used when compiling
# Line 241  For convenience, we use the same bit def Line 321  For convenience, we use the same bit def
321    
322  Then we can use ctype_digit and ctype_xdigit in the code. */  Then we can use ctype_digit and ctype_xdigit in the code. */
323    
324  #if !EBCDIC    /* This is the "normal" case, for ASCII systems */  #ifndef EBCDIC  /* This is the "normal" case, for ASCII systems */
325  static const unsigned char digitab[] =  static const unsigned char digitab[] =
326    {    {
327    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
# Line 277  static const unsigned char digitab[] = Line 357  static const unsigned char digitab[] =
357    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
358    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
359    
360  #else          /* This is the "abnormal" case, for EBCDIC systems */  #else           /* This is the "abnormal" case, for EBCDIC systems */
361  static const unsigned char digitab[] =  static const unsigned char digitab[] =
362    {    {
363    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
# Line 291  static const unsigned char digitab[] = Line 371  static const unsigned char digitab[] =
371    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
372    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
373    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
374    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88-     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- 95    */
375    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
376    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
377    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
# Line 325  static const unsigned char ebcdic_charta Line 405  static const unsigned char ebcdic_charta
405    0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */    0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */
406    0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */    0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
407    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
408    0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88-  */    0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- 95 */
409    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
410    0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */    0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
411    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
# Line 352  static const unsigned char ebcdic_charta Line 432  static const unsigned char ebcdic_charta
432  /* Definition to allow mutual recursion */  /* Definition to allow mutual recursion */
433    
434  static BOOL  static BOOL
435    compile_regex(int, int, int *, uschar **, const uschar **, int *, BOOL, int,    compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
436      int *, int *, branch_chain *, compile_data *);      int *, int *, branch_chain *, compile_data *, int *);
437    
438    
439    
440    /*************************************************
441    *            Find an error text                  *
442    *************************************************/
443    
444    /* The error texts are now all in one long string, to save on relocations. As
445    some of the text is of unknown length, we can't use a table of offsets.
446    Instead, just count through the strings. This is not a performance issue
447    because it happens only when there has been a compilation error.
448    
449    Argument:   the error number
450    Returns:    pointer to the error string
451    */
452    
453    static const char *
454    find_error_text(int n)
455    {
456    const char *s = error_texts;
457    for (; n > 0; n--) while (*s++ != 0);
458    return s;
459    }
460    
461    
462  /*************************************************  /*************************************************
# Line 363  static BOOL Line 465  static BOOL
465    
466  /* This function is called when a \ has been encountered. It either returns a  /* This function is called when a \ has been encountered. It either returns a
467  positive value for a simple escape such as \n, or a negative value which  positive value for a simple escape such as \n, or a negative value which
468  encodes one of the more complicated things such as \d. When UTF-8 is enabled,  encodes one of the more complicated things such as \d. A backreference to group
469  a positive value greater than 255 may be returned. On entry, ptr is pointing at  n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
470  the \. On exit, it is on the final character of the escape sequence.  UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
471    ptr is pointing at the \. On exit, it is on the final character of the escape
472    sequence.
473    
474  Arguments:  Arguments:
475    ptrptr         points to the pattern position pointer    ptrptr         points to the pattern position pointer
# Line 376  Arguments: Line 480  Arguments:
480    
481  Returns:         zero or positive => a data character  Returns:         zero or positive => a data character
482                   negative => a special escape sequence                   negative => a special escape sequence
483                   on error, errorptr is set                   on error, errorcodeptr is set
484  */  */
485    
486  static int  static int
# Line 394  ptr--; /* Set Line 498  ptr--; /* Set
498    
499  if (c == 0) *errorcodeptr = ERR1;  if (c == 0) *errorcodeptr = ERR1;
500    
501  /* Non-alphamerics are literals. For digits or letters, do an initial lookup in  /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
502  a table. A non-zero result is something that can be returned immediately.  in a table. A non-zero result is something that can be returned immediately.
503  Otherwise further processing may be required. */  Otherwise further processing may be required. */
504    
505  #if !EBCDIC    /* ASCII coding */  #ifndef EBCDIC  /* ASCII coding */
506  else if (c < '0' || c > 'z') {}                           /* Not alphameric */  else if (c < '0' || c > 'z') {}                           /* Not alphanumeric */
507  else if ((i = escapes[c - '0']) != 0) c = i;  else if ((i = escapes[c - '0']) != 0) c = i;
508    
509  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
510  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphameric */  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphanumeric */
511  else if ((i = escapes[c - 0x48]) != 0)  c = i;  else if ((i = escapes[c - 0x48]) != 0)  c = i;
512  #endif  #endif
513    
# Line 412  else if ((i = escapes[c - 0x48]) != 0) Line 516  else if ((i = escapes[c - 0x48]) != 0)
516  else  else
517    {    {
518    const uschar *oldptr;    const uschar *oldptr;
519      BOOL braced, negated;
520    
521    switch (c)    switch (c)
522      {      {
523      /* A number of Perl escapes are not handled by PCRE. We give an explicit      /* A number of Perl escapes are not handled by PCRE. We give an explicit
# Line 425  else Line 531  else
531      *errorcodeptr = ERR37;      *errorcodeptr = ERR37;
532      break;      break;
533    
534        /* \g must be followed by one of a number of specific things:
535    
536        (1) A number, either plain or braced. If positive, it is an absolute
537        backreference. If negative, it is a relative backreference. This is a Perl
538        5.10 feature.
539    
540        (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
541        is part of Perl's movement towards a unified syntax for back references. As
542        this is synonymous with \k{name}, we fudge it up by pretending it really
543        was \k.
544    
545        (3) For Oniguruma compatibility we also support \g followed by a name or a
546        number either in angle brackets or in single quotes. However, these are
547        (possibly recursive) subroutine calls, _not_ backreferences. Just return
548        the -ESC_g code (cf \k). */
549    
550        case 'g':
551        if (ptr[1] == '<' || ptr[1] == '\'')
552          {
553          c = -ESC_g;
554          break;
555          }
556    
557        /* Handle the Perl-compatible cases */
558    
559        if (ptr[1] == '{')
560          {
561          const uschar *p;
562          for (p = ptr+2; *p != 0 && *p != '}'; p++)
563            if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
564          if (*p != 0 && *p != '}')
565            {
566            c = -ESC_k;
567            break;
568            }
569          braced = TRUE;
570          ptr++;
571          }
572        else braced = FALSE;
573    
574        if (ptr[1] == '-')
575          {
576          negated = TRUE;
577          ptr++;
578          }
579        else negated = FALSE;
580    
581        c = 0;
582        while ((digitab[ptr[1]] & ctype_digit) != 0)
583          c = c * 10 + *(++ptr) - '0';
584    
585        if (c < 0)   /* Integer overflow */
586          {
587          *errorcodeptr = ERR61;
588          break;
589          }
590    
591        if (braced && *(++ptr) != '}')
592          {
593          *errorcodeptr = ERR57;
594          break;
595          }
596    
597        if (c == 0)
598          {
599          *errorcodeptr = ERR58;
600          break;
601          }
602    
603        if (negated)
604          {
605          if (c > bracount)
606            {
607            *errorcodeptr = ERR15;
608            break;
609            }
610          c = bracount - (c - 1);
611          }
612    
613        c = -(ESC_REF + c);
614        break;
615    
616      /* The handling of escape sequences consisting of a string of digits      /* The handling of escape sequences consisting of a string of digits
617      starting with one that is not zero is not straightforward. By experiment,      starting with one that is not zero is not straightforward. By experiment,
618      the way Perl works seems to be as follows:      the way Perl works seems to be as follows:
# Line 446  else Line 634  else
634        c -= '0';        c -= '0';
635        while ((digitab[ptr[1]] & ctype_digit) != 0)        while ((digitab[ptr[1]] & ctype_digit) != 0)
636          c = c * 10 + *(++ptr) - '0';          c = c * 10 + *(++ptr) - '0';
637          if (c < 0)    /* Integer overflow */
638            {
639            *errorcodeptr = ERR61;
640            break;
641            }
642        if (c < 10 || c <= bracount)        if (c < 10 || c <= bracount)
643          {          {
644          c = -(ESC_REF + c);          c = -(ESC_REF + c);
# Line 495  else Line 688  else
688          if (c == 0 && cc == '0') continue;     /* Leading zeroes */          if (c == 0 && cc == '0') continue;     /* Leading zeroes */
689          count++;          count++;
690    
691  #if !EBCDIC    /* ASCII coding */  #ifndef EBCDIC  /* ASCII coding */
692          if (cc >= 'a') cc -= 32;               /* Convert to upper case */          if (cc >= 'a') cc -= 32;               /* Convert to upper case */
693          c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));          c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
694  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
695          if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */          if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */
696          c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));          c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
697  #endif  #endif
# Line 522  else Line 715  else
715        {        {
716        int cc;                               /* Some compilers don't like ++ */        int cc;                               /* Some compilers don't like ++ */
717        cc = *(++ptr);                        /* in initializers */        cc = *(++ptr);                        /* in initializers */
718  #if !EBCDIC    /* ASCII coding */  #ifndef EBCDIC  /* ASCII coding */
719        if (cc >= 'a') cc -= 32;              /* Convert to upper case */        if (cc >= 'a') cc -= 32;              /* Convert to upper case */
720        c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));        c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
721  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
722        if (cc <= 'z') cc += 64;              /* Convert to upper case */        if (cc <= 'z') cc += 64;              /* Convert to upper case */
723        c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));        c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
724  #endif  #endif
725        }        }
726      break;      break;
727    
728      /* Other special escapes not starting with a digit are straightforward */      /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
729        This coding is ASCII-specific, but then the whole concept of \cx is
730        ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
731    
732      case 'c':      case 'c':
733      c = *(++ptr);      c = *(++ptr);
734      if (c == 0)      if (c == 0)
735        {        {
736        *errorcodeptr = ERR2;        *errorcodeptr = ERR2;
737        return 0;        break;
738        }        }
739    
740      /* A letter is upper-cased; then the 0x40 bit is flipped. This coding  #ifndef EBCDIC  /* ASCII coding */
     is ASCII-specific, but then the whole concept of \cx is ASCII-specific.  
     (However, an EBCDIC equivalent has now been added.) */  
   
 #if !EBCDIC    /* ASCII coding */  
741      if (c >= 'a' && c <= 'z') c -= 32;      if (c >= 'a' && c <= 'z') c -= 32;
742      c ^= 0x40;      c ^= 0x40;
743  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
744      if (c >= 'a' && c <= 'z') c += 64;      if (c >= 'a' && c <= 'z') c += 64;
745      c ^= 0xC0;      c ^= 0xC0;
746  #endif  #endif
747      break;      break;
748    
749      /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any      /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
750      other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,      other alphanumeric following \ is an error if PCRE_EXTRA was set;
751      for Perl compatibility, it is a literal. This code looks a bit odd, but      otherwise, for Perl compatibility, it is a literal. This code looks a bit
752      there used to be some cases other than the default, and there may be again      odd, but there used to be some cases other than the default, and there may
753      in future, so I haven't "optimized" it. */      be again in future, so I haven't "optimized" it. */
754    
755      default:      default:
756      if ((options & PCRE_EXTRA) != 0) switch(c)      if ((options & PCRE_EXTRA) != 0) switch(c)
# Line 619  if (c == '{') Line 810  if (c == '{')
810      *negptr = TRUE;      *negptr = TRUE;
811      ptr++;      ptr++;
812      }      }
813    for (i = 0; i < sizeof(name) - 1; i++)    for (i = 0; i < (int)sizeof(name) - 1; i++)
814      {      {
815      c = *(++ptr);      c = *(++ptr);
816      if (c == 0) goto ERROR_RETURN;      if (c == 0) goto ERROR_RETURN;
# Line 648  top = _pcre_utt_size; Line 839  top = _pcre_utt_size;
839  while (bot < top)  while (bot < top)
840    {    {
841    i = (bot + top) >> 1;    i = (bot + top) >> 1;
842    c = strcmp(name, _pcre_utt[i].name);    c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);
843    if (c == 0)    if (c == 0)
844      {      {
845      *dptr = _pcre_utt[i].value;      *dptr = _pcre_utt[i].value;
# Line 772  return p; Line 963  return p;
963    
964    
965  /*************************************************  /*************************************************
966  *     Find forward referenced named subpattern   *  *       Find forward referenced subpattern       *
967  *************************************************/  *************************************************/
968    
969  /* This function scans along a pattern looking for capturing subpatterns, and  /* This function scans along a pattern's text looking for capturing
970  counting them. If it finds a named pattern that matches the name it is given,  subpatterns, and counting them. If it finds a named pattern that matches the
971  it returns its number. This is used for forward references to named  name it is given, it returns its number. Alternatively, if the name is NULL, it
972  subpatterns. We know that if (?P< is encountered, the name will be terminated  returns when it reaches a given numbered subpattern. This is used for forward
973  by '>' because that is checked in the first pass.  references to subpatterns. We know that if (?P< is encountered, the name will
974    be terminated by '>' because that is checked in the first pass.
975    
976  Arguments:  Arguments:
977    pointer      current position in the pattern    ptr          current position in the pattern
978    count        current count of capturing parens    count        current count of capturing parens so far encountered
979    name         name to seek    name         name to seek, or NULL if seeking a numbered subpattern
980    namelen      name length    lorn         name length, or subpattern number if name is NULL
981      xmode        TRUE if we are in /x mode
982    
983  Returns:       the number of the named subpattern, or -1 if not found  Returns:       the number of the named subpattern, or -1 if not found
984  */  */
985    
986  static int  static int
987  find_named_parens(const uschar *ptr, int count, const uschar *name, int namelen)  find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
988      BOOL xmode)
989  {  {
990  const uschar *thisname;  const uschar *thisname;
991    
992  for (; *ptr != 0; ptr++)  for (; *ptr != 0; ptr++)
993    {    {
994    if (*ptr == '\\' && ptr[1] != 0) { ptr++; continue; }    int term;
995    
996      /* Skip over backslashed characters and also entire \Q...\E */
997    
998      if (*ptr == '\\')
999        {
1000        if (*(++ptr) == 0) return -1;
1001        if (*ptr == 'Q') for (;;)
1002          {
1003          while (*(++ptr) != 0 && *ptr != '\\');
1004          if (*ptr == 0) return -1;
1005          if (*(++ptr) == 'E') break;
1006          }
1007        continue;
1008        }
1009    
1010      /* Skip over character classes */
1011    
1012      if (*ptr == '[')
1013        {
1014        while (*(++ptr) != ']')
1015          {
1016          if (*ptr == 0) return -1;
1017          if (*ptr == '\\')
1018            {
1019            if (*(++ptr) == 0) return -1;
1020            if (*ptr == 'Q') for (;;)
1021              {
1022              while (*(++ptr) != 0 && *ptr != '\\');
1023              if (*ptr == 0) return -1;
1024              if (*(++ptr) == 'E') break;
1025              }
1026            continue;
1027            }
1028          }
1029        continue;
1030        }
1031    
1032      /* Skip comments in /x mode */
1033    
1034      if (xmode && *ptr == '#')
1035        {
1036        while (*(++ptr) != 0 && *ptr != '\n');
1037        if (*ptr == 0) return -1;
1038        continue;
1039        }
1040    
1041      /* An opening parens must now be a real metacharacter */
1042    
1043    if (*ptr != '(') continue;    if (*ptr != '(') continue;
1044    if (ptr[1] != '?') { count++; continue; }    if (ptr[1] != '?' && ptr[1] != '*')
1045    if (ptr[2] == '(') { ptr += 2; continue; }      {
1046    if (ptr[2] != 'P' || ptr[3] != '<') continue;      count++;
1047        if (name == NULL && count == lorn) return count;
1048        continue;
1049        }
1050    
1051      ptr += 2;
1052      if (*ptr == 'P') ptr++;                      /* Allow optional P */
1053    
1054      /* We have to disambiguate (?<! and (?<= from (?<name> */
1055    
1056      if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
1057           *ptr != '\'')
1058        continue;
1059    
1060    count++;    count++;
1061    ptr += 4;  
1062      if (name == NULL && count == lorn) return count;
1063      term = *ptr++;
1064      if (term == '<') term = '>';
1065    thisname = ptr;    thisname = ptr;
1066    while (*ptr != '>') ptr++;    while (*ptr != term) ptr++;
1067    if (namelen == ptr - thisname && strncmp(name, thisname, namelen) == 0)    if (name != NULL && lorn == ptr - thisname &&
1068          strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1069      return count;      return count;
1070    }    }
1071    
1072  return -1;  return -1;
1073  }  }
1074    
# Line 862  for (;;) Line 1123  for (;;)
1123    
1124      case OP_CALLOUT:      case OP_CALLOUT:
1125      case OP_CREF:      case OP_CREF:
1126      case OP_BRANUMBER:      case OP_RREF:
1127        case OP_DEF:
1128      code += _pcre_OP_lengths[*code];      code += _pcre_OP_lengths[*code];
1129      break;      break;
1130    
# Line 907  for (;;) Line 1169  for (;;)
1169    {    {
1170    int d;    int d;
1171    register int op = *cc;    register int op = *cc;
   if (op >= OP_BRA) op = OP_BRA;  
   
1172    switch (op)    switch (op)
1173      {      {
1174        case OP_CBRA:
1175      case OP_BRA:      case OP_BRA:
1176      case OP_ONCE:      case OP_ONCE:
1177      case OP_COND:      case OP_COND:
1178      d = find_fixedlength(cc, options);      d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1179      if (d < 0) return d;      if (d < 0) return d;
1180      branchlength += d;      branchlength += d;
1181      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
# Line 949  for (;;) Line 1210  for (;;)
1210      /* Skip over things that don't match chars */      /* Skip over things that don't match chars */
1211    
1212      case OP_REVERSE:      case OP_REVERSE:
     case OP_BRANUMBER:  
1213      case OP_CREF:      case OP_CREF:
1214        case OP_RREF:
1215        case OP_DEF:
1216      case OP_OPT:      case OP_OPT:
1217      case OP_CALLOUT:      case OP_CALLOUT:
1218      case OP_SOD:      case OP_SOD:
# Line 995  for (;;) Line 1257  for (;;)
1257    
1258      case OP_TYPEEXACT:      case OP_TYPEEXACT:
1259      branchlength += GET2(cc,1);      branchlength += GET2(cc,1);
1260        if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1261      cc += 4;      cc += 4;
1262      break;      break;
1263    
# Line 1094  for (;;) Line 1357  for (;;)
1357    
1358    if (c == OP_XCLASS) code += GET(code, 1);    if (c == OP_XCLASS) code += GET(code, 1);
1359    
1360    /* Handle bracketed group */    /* Handle capturing bracket */
1361    
1362    else if (c > OP_BRA)    else if (c == OP_CBRA)
1363      {      {
1364      int n = c - OP_BRA;      int n = GET2(code, 1+LINK_SIZE);
     if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE);  
1365      if (n == number) return (uschar *)code;      if (n == number) return (uschar *)code;
1366      code += _pcre_OP_lengths[OP_BRA];      code += _pcre_OP_lengths[c];
1367      }      }
1368    
1369    /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes    /* Otherwise, we can get the item's length from the table, except that for
1370    that are followed by a character may be followed by a multi-byte character.    repeated character types, we have to test for \p and \P, which have an extra
1371    The length in the table is a minimum, so we have to scan along to skip the    two bytes of parameters. */
   extra bytes. All opcodes are less than 128, so we can use relatively  
   efficient code. */  
1372    
1373    else    else
1374      {      {
1375        switch(c)
1376          {
1377          case OP_TYPESTAR:
1378          case OP_TYPEMINSTAR:
1379          case OP_TYPEPLUS:
1380          case OP_TYPEMINPLUS:
1381          case OP_TYPEQUERY:
1382          case OP_TYPEMINQUERY:
1383          case OP_TYPEPOSSTAR:
1384          case OP_TYPEPOSPLUS:
1385          case OP_TYPEPOSQUERY:
1386          if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1387          break;
1388    
1389          case OP_TYPEUPTO:
1390          case OP_TYPEMINUPTO:
1391          case OP_TYPEEXACT:
1392          case OP_TYPEPOSUPTO:
1393          if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1394          break;
1395          }
1396    
1397        /* Add in the fixed length from the table */
1398    
1399      code += _pcre_OP_lengths[c];      code += _pcre_OP_lengths[c];
1400    
1401      /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1402      a multi-byte character. The length in the table is a minimum, so we have to
1403      arrange to skip the extra bytes. */
1404    
1405    #ifdef SUPPORT_UTF8
1406      if (utf8) switch(c)      if (utf8) switch(c)
1407        {        {
1408        case OP_CHAR:        case OP_CHAR:
# Line 1120  for (;;) Line 1410  for (;;)
1410        case OP_EXACT:        case OP_EXACT:
1411        case OP_UPTO:        case OP_UPTO:
1412        case OP_MINUPTO:        case OP_MINUPTO:
1413          case OP_POSUPTO:
1414        case OP_STAR:        case OP_STAR:
1415        case OP_MINSTAR:        case OP_MINSTAR:
1416          case OP_POSSTAR:
1417        case OP_PLUS:        case OP_PLUS:
1418        case OP_MINPLUS:        case OP_MINPLUS:
1419          case OP_POSPLUS:
1420        case OP_QUERY:        case OP_QUERY:
1421        case OP_MINQUERY:        case OP_MINQUERY:
1422        while ((*code & 0xc0) == 0x80) code++;        case OP_POSQUERY:
1423          if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1424        break;        break;
1425        }        }
1426    #endif
1427      }      }
1428    }    }
1429  }  }
# Line 1164  for (;;) Line 1459  for (;;)
1459    
1460    if (c == OP_XCLASS) code += GET(code, 1);    if (c == OP_XCLASS) code += GET(code, 1);
1461    
1462    /* All bracketed groups have the same length. */    /* Otherwise, we can get the item's length from the table, except that for
1463      repeated character types, we have to test for \p and \P, which have an extra
1464      two bytes of parameters. */
1465    
1466    else if (c > OP_BRA)    else
1467      {      {
1468      code += _pcre_OP_lengths[OP_BRA];      switch(c)
1469      }        {
1470          case OP_TYPESTAR:
1471          case OP_TYPEMINSTAR:
1472          case OP_TYPEPLUS:
1473          case OP_TYPEMINPLUS:
1474          case OP_TYPEQUERY:
1475          case OP_TYPEMINQUERY:
1476          case OP_TYPEPOSSTAR:
1477          case OP_TYPEPOSPLUS:
1478          case OP_TYPEPOSQUERY:
1479          if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1480          break;
1481    
1482          case OP_TYPEPOSUPTO:
1483          case OP_TYPEUPTO:
1484          case OP_TYPEMINUPTO:
1485          case OP_TYPEEXACT:
1486          if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1487          break;
1488          }
1489    
1490    /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes      /* Add in the fixed length from the table */
   that are followed by a character may be followed by a multi-byte character.  
   The length in the table is a minimum, so we have to scan along to skip the  
   extra bytes. All opcodes are less than 128, so we can use relatively  
   efficient code. */  
1491    
   else  
     {  
1492      code += _pcre_OP_lengths[c];      code += _pcre_OP_lengths[c];
1493    
1494        /* In UTF-8 mode, opcodes that are followed by a character may be followed
1495        by a multi-byte character. The length in the table is a minimum, so we have
1496        to arrange to skip the extra bytes. */
1497    
1498    #ifdef SUPPORT_UTF8
1499      if (utf8) switch(c)      if (utf8) switch(c)
1500        {        {
1501        case OP_CHAR:        case OP_CHAR:
# Line 1187  for (;;) Line 1503  for (;;)
1503        case OP_EXACT:        case OP_EXACT:
1504        case OP_UPTO:        case OP_UPTO:
1505        case OP_MINUPTO:        case OP_MINUPTO:
1506          case OP_POSUPTO:
1507        case OP_STAR:        case OP_STAR:
1508        case OP_MINSTAR:        case OP_MINSTAR:
1509          case OP_POSSTAR:
1510        case OP_PLUS:        case OP_PLUS:
1511        case OP_MINPLUS:        case OP_MINPLUS:
1512          case OP_POSPLUS:
1513        case OP_QUERY:        case OP_QUERY:
1514        case OP_MINQUERY:        case OP_MINQUERY:
1515        while ((*code & 0xc0) == 0x80) code++;        case OP_POSQUERY:
1516          if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1517        break;        break;
1518        }        }
1519    #endif
1520      }      }
1521    }    }
1522  }  }
# Line 1207  for (;;) Line 1528  for (;;)
1528  *************************************************/  *************************************************/
1529    
1530  /* This function scans through a branch of a compiled pattern to see whether it  /* This function scans through a branch of a compiled pattern to see whether it
1531  can match the empty string or not. It is called only from could_be_empty()  can match the empty string or not. It is called from could_be_empty()
1532  below. Note that first_significant_code() skips over assertions. If we hit an  below and from compile_branch() when checking for an unlimited repeat of a
1533  unclosed bracket, we return "empty" - this means we've struck an inner bracket  group that can match nothing. Note that first_significant_code() skips over
1534  whose current branch will already have been scanned.  backward and negative forward assertions when its final argument is TRUE. If we
1535    hit an unclosed bracket, we return "empty" - this means we've struck an inner
1536    bracket whose current branch will already have been scanned.
1537    
1538  Arguments:  Arguments:
1539    code        points to start of search    code        points to start of search
# Line 1224  static BOOL Line 1547  static BOOL
1547  could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)  could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1548  {  {
1549  register int c;  register int c;
1550  for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0, TRUE);  for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1551       code < endcode;       code < endcode;
1552       code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))       code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1553    {    {
# Line 1232  for (code = first_significant_code(code Line 1555  for (code = first_significant_code(code
1555    
1556    c = *code;    c = *code;
1557    
1558    if (c >= OP_BRA)    /* Skip over forward assertions; the other assertions are skipped by
1559      first_significant_code() with a TRUE final argument. */
1560    
1561      if (c == OP_ASSERT)
1562        {
1563        do code += GET(code, 1); while (*code == OP_ALT);
1564        c = *code;
1565        continue;
1566        }
1567    
1568      /* Groups with zero repeats can of course be empty; skip them. */
1569    
1570      if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1571        {
1572        code += _pcre_OP_lengths[c];
1573        do code += GET(code, 1); while (*code == OP_ALT);
1574        c = *code;
1575        continue;
1576        }
1577    
1578      /* For other groups, scan the branches. */
1579    
1580      if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1581      {      {
1582      BOOL empty_branch;      BOOL empty_branch;
1583      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
# Line 1248  for (code = first_significant_code(code Line 1593  for (code = first_significant_code(code
1593        }        }
1594      while (*code == OP_ALT);      while (*code == OP_ALT);
1595      if (!empty_branch) return FALSE;   /* All branches are non-empty */      if (!empty_branch) return FALSE;   /* All branches are non-empty */
     code += 1 + LINK_SIZE;  
1596      c = *code;      c = *code;
1597        continue;
1598      }      }
1599    
1600    else switch (c)    /* Handle the other opcodes */
1601    
1602      switch (c)
1603      {      {
1604      /* Check for quantifiers after a class */      /* Check for quantifiers after a class. XCLASS is used for classes that
1605        cannot be represented just by a bit map. This includes negated single
1606        high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1607        actual length is stored in the compiled code, so we must update "code"
1608        here. */
1609    
1610  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1611      case OP_XCLASS:      case OP_XCLASS:
1612      ccode = code + GET(code, 1);      ccode = code += GET(code, 1);
1613      goto CHECK_CLASS_REPEAT;      goto CHECK_CLASS_REPEAT;
1614  #endif  #endif
1615    
# Line 1308  for (code = first_significant_code(code Line 1659  for (code = first_significant_code(code
1659      case OP_NOT:      case OP_NOT:
1660      case OP_PLUS:      case OP_PLUS:
1661      case OP_MINPLUS:      case OP_MINPLUS:
1662        case OP_POSPLUS:
1663      case OP_EXACT:      case OP_EXACT:
1664      case OP_NOTPLUS:      case OP_NOTPLUS:
1665      case OP_NOTMINPLUS:      case OP_NOTMINPLUS:
1666        case OP_NOTPOSPLUS:
1667      case OP_NOTEXACT:      case OP_NOTEXACT:
1668      case OP_TYPEPLUS:      case OP_TYPEPLUS:
1669      case OP_TYPEMINPLUS:      case OP_TYPEMINPLUS:
1670        case OP_TYPEPOSPLUS:
1671      case OP_TYPEEXACT:      case OP_TYPEEXACT:
1672      return FALSE;      return FALSE;
1673    
1674        /* These are going to continue, as they may be empty, but we have to
1675        fudge the length for the \p and \P cases. */
1676    
1677        case OP_TYPESTAR:
1678        case OP_TYPEMINSTAR:
1679        case OP_TYPEPOSSTAR:
1680        case OP_TYPEQUERY:
1681        case OP_TYPEMINQUERY:
1682        case OP_TYPEPOSQUERY:
1683        if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1684        break;
1685    
1686        /* Same for these */
1687    
1688        case OP_TYPEUPTO:
1689        case OP_TYPEMINUPTO:
1690        case OP_TYPEPOSUPTO:
1691        if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1692        break;
1693    
1694      /* End of branch */      /* End of branch */
1695    
1696      case OP_KET:      case OP_KET:
# Line 1325  for (code = first_significant_code(code Line 1699  for (code = first_significant_code(code
1699      case OP_ALT:      case OP_ALT:
1700      return TRUE;      return TRUE;
1701    
1702      /* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO  may be      /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1703      followed by a multibyte character */      MINUPTO, and POSUPTO may be followed by a multibyte character */
1704    
1705  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1706      case OP_STAR:      case OP_STAR:
1707      case OP_MINSTAR:      case OP_MINSTAR:
1708        case OP_POSSTAR:
1709      case OP_QUERY:      case OP_QUERY:
1710      case OP_MINQUERY:      case OP_MINQUERY:
1711        case OP_POSQUERY:
1712      case OP_UPTO:      case OP_UPTO:
1713      case OP_MINUPTO:      case OP_MINUPTO:
1714        case OP_POSUPTO:
1715      if (utf8) while ((code[2] & 0xc0) == 0x80) code++;      if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1716      break;      break;
1717  #endif  #endif
# Line 1383  return TRUE; Line 1760  return TRUE;
1760  *************************************************/  *************************************************/
1761    
1762  /* This function is called when the sequence "[:" or "[." or "[=" is  /* This function is called when the sequence "[:" or "[." or "[=" is
1763  encountered in a character class. It checks whether this is followed by an  encountered in a character class. It checks whether this is followed by a
1764  optional ^ and then a sequence of letters, terminated by a matching ":]" or  sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
1765  ".]" or "=]".  reach an unescaped ']' without the special preceding character, return FALSE.
1766    
1767    Originally, this function only recognized a sequence of letters between the
1768    terminators, but it seems that Perl recognizes any sequence of characters,
1769    though of course unknown POSIX names are subsequently rejected. Perl gives an
1770    "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
1771    didn't consider this to be a POSIX class. Likewise for [:1234:].
1772    
1773    The problem in trying to be exactly like Perl is in the handling of escapes. We
1774    have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
1775    class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
1776    below handles the special case of \], but does not try to do any other escape
1777    processing. This makes it different from Perl for cases such as [:l\ower:]
1778    where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
1779    "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
1780    I think.
1781    
1782  Argument:  Arguments:
1783    ptr      pointer to the initial [    ptr      pointer to the initial [
1784    endptr   where to return the end pointer    endptr   where to return the end pointer
   cd       pointer to compile data  
1785    
1786  Returns:   TRUE or FALSE  Returns:   TRUE or FALSE
1787  */  */
1788    
1789  static BOOL  static BOOL
1790  check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)  check_posix_syntax(const uschar *ptr, const uschar **endptr)
1791  {  {
1792  int terminator;          /* Don't combine these lines; the Solaris cc */  int terminator;          /* Don't combine these lines; the Solaris cc */
1793  terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */  terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
1794  if (*(++ptr) == '^') ptr++;  for (++ptr; *ptr != 0; ptr++)
 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;  
 if (*ptr == terminator && ptr[1] == ']')  
1795    {    {
1796    *endptr = ptr;    if (*ptr == '\\' && ptr[1] == ']') ptr++; else
1797    return TRUE;      {
1798        if (*ptr == ']') return FALSE;
1799        if (*ptr == terminator && ptr[1] == ']')
1800          {
1801          *endptr = ptr;
1802          return TRUE;
1803          }
1804        }
1805    }    }
1806  return FALSE;  return FALSE;
1807  }  }
# Line 1430  Returns: a value representing the na Line 1826  Returns: a value representing the na
1826  static int  static int
1827  check_posix_name(const uschar *ptr, int len)  check_posix_name(const uschar *ptr, int len)
1828  {  {
1829    const char *pn = posix_names;
1830  register int yield = 0;  register int yield = 0;
1831  while (posix_name_lengths[yield] != 0)  while (posix_name_lengths[yield] != 0)
1832    {    {
1833    if (len == posix_name_lengths[yield] &&    if (len == posix_name_lengths[yield] &&
1834      strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;      strncmp((const char *)ptr, pn, len) == 0) return yield;
1835      pn += posix_name_lengths[yield] + 1;
1836    yield++;    yield++;
1837    }    }
1838  return -1;  return -1;
# Line 1452  earlier groups that are outside the curr Line 1850  earlier groups that are outside the curr
1850  optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before  optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1851  it, after it has been compiled. This means that any OP_RECURSE items within it  it, after it has been compiled. This means that any OP_RECURSE items within it
1852  that refer to the group itself or any contained groups have to have their  that refer to the group itself or any contained groups have to have their
1853  offsets adjusted. That is the job of this function. Before it is called, the  offsets adjusted. That is one of the jobs of this function. Before it is called,
1854  partially compiled regex must be temporarily terminated with OP_END.  the partially compiled regex must be temporarily terminated with OP_END.
1855    
1856    This function has been extended with the possibility of forward references for
1857    recursions and subroutine calls. It must also check the list of such references
1858    for the group we are dealing with. If it finds that one of the recursions in
1859    the current group is on this list, it adjusts the offset in the list, not the
1860    value in the reference (which is a group number).
1861    
1862  Arguments:  Arguments:
1863    group      points to the start of the group    group      points to the start of the group
1864    adjust     the amount by which the group is to be moved    adjust     the amount by which the group is to be moved
1865    utf8       TRUE in UTF-8 mode    utf8       TRUE in UTF-8 mode
1866    cd         contains pointers to tables etc.    cd         contains pointers to tables etc.
1867      save_hwm   the hwm forward reference pointer at the start of the group
1868    
1869  Returns:     nothing  Returns:     nothing
1870  */  */
1871    
1872  static void  static void
1873  adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd)  adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1874      uschar *save_hwm)
1875  {  {
1876  uschar *ptr = group;  uschar *ptr = group;
1877    
1878  while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)  while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1879    {    {
1880    int offset = GET(ptr, 1);    int offset;
1881    if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);    uschar *hc;
1882    
1883      /* See if this recursion is on the forward reference list. If so, adjust the
1884      reference. */
1885    
1886      for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1887        {
1888        offset = GET(hc, 0);
1889        if (cd->start_code + offset == ptr + 1)
1890          {
1891          PUT(hc, 0, offset + adjust);
1892          break;
1893          }
1894        }
1895    
1896      /* Otherwise, adjust the recursion offset if it's after the start of this
1897      group. */
1898    
1899      if (hc >= cd->hwm)
1900        {
1901        offset = GET(ptr, 1);
1902        if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1903        }
1904    
1905    ptr += 1 + LINK_SIZE;    ptr += 1 + LINK_SIZE;
1906    }    }
1907  }  }
# Line 1550  Yield: TRUE when range returned; Line 1980  Yield: TRUE when range returned;
1980  */  */
1981    
1982  static BOOL  static BOOL
1983  get_othercase_range(int *cptr, int d, int *ocptr, int *odptr)  get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
1984      unsigned int *odptr)
1985  {  {
1986  int c, othercase, next;  unsigned int c, othercase, next;
1987    
1988  for (c = *cptr; c <= d; c++)  for (c = *cptr; c <= d; c++)
1989    { if ((othercase = _pcre_ucp_othercase(c)) >= 0) break; }    { if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) break; }
1990    
1991  if (c > d) return FALSE;  if (c > d) return FALSE;
1992    
# Line 1576  return TRUE; Line 2007  return TRUE;
2007  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2008    
2009    
2010    
2011  /*************************************************  /*************************************************
2012  *           Compile one branch                   *  *     Check if auto-possessifying is possible    *
2013  *************************************************/  *************************************************/
2014    
2015  /* Scan the pattern, compiling it into the code vector. If the options are  /* This function is called for unlimited repeats of certain items, to see
2016  changed during the branch, the pointer is used to change the external options  whether the next thing could possibly match the repeated item. If not, it makes
2017  bits.  sense to automatically possessify the repeated item.
2018    
2019  Arguments:  Arguments:
2020    optionsptr     pointer to the option bits    op_code       the repeated op code
2021    brackets       points to number of extracting brackets used    this          data for this item, depends on the opcode
2022    codeptr        points to the pointer to the current code point    utf8          TRUE in UTF-8 mode
2023    ptrptr         points to the current pattern pointer    utf8_char     used for utf8 character bytes, NULL if not relevant
2024    errorcodeptr   points to error code variable    ptr           next character in pattern
2025    firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)    options       options bits
2026    reqbyteptr     set to the last literal character required, else < 0    cd            contains pointers to tables etc.
   bcptr          points to current branch chain  
   cd             contains pointers to tables etc.  
2027    
2028  Returns:         TRUE on success  Returns:        TRUE if possessifying is wanted
                  FALSE, with *errorcodeptr set non-zero on error  
2029  */  */
2030    
2031  static BOOL  static BOOL
2032  compile_branch(int *optionsptr, int *brackets, uschar **codeptr,  check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
2033    const uschar **ptrptr, int *errorcodeptr, int *firstbyteptr,    const uschar *ptr, int options, compile_data *cd)
   int *reqbyteptr, branch_chain *bcptr, compile_data *cd)  
2034  {  {
2035  int repeat_type, op_type;  int next;
2036  int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */  
2037  int bravalue = 0;  /* Skip whitespace and comments in extended mode */
2038  int greedy_default, greedy_non_default;  
2039  int firstbyte, reqbyte;  if ((options & PCRE_EXTENDED) != 0)
2040  int zeroreqbyte, zerofirstbyte;    {
2041  int req_caseopt, reqvary, tempreqvary;    for (;;)
2042  int options = *optionsptr;      {
2043  int after_manual_callout = 0;      while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2044  register int c;      if (*ptr == '#')
2045  register uschar *code = *codeptr;        {
2046  uschar *tempcode;        while (*(++ptr) != 0)
2047  BOOL inescq = FALSE;          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2048  BOOL groupsetfirstbyte = FALSE;        }
2049  const uschar *ptr = *ptrptr;      else break;
2050  const uschar *tempptr;      }
2051  uschar *previous = NULL;    }
2052  uschar *previous_callout = NULL;  
2053  uschar classbits[32];  /* If the next item is one that we can handle, get its value. A non-negative
2054    value is a character, a negative value is an escape value. */
2055    
2056    if (*ptr == '\\')
2057      {
2058      int temperrorcode = 0;
2059      next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2060      if (temperrorcode != 0) return FALSE;
2061      ptr++;    /* Point after the escape sequence */
2062      }
2063    
2064    else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2065      {
2066  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2067  BOOL class_utf8;    if (utf8) { GETCHARINC(next, ptr); } else
 BOOL utf8 = (options & PCRE_UTF8) != 0;  
 uschar *class_utf8data;  
 uschar utf8_char[6];  
 #else  
 BOOL utf8 = FALSE;  
2068  #endif  #endif
2069      next = *ptr++;
2070      }
2071    
2072  /* Set up the default and non-default settings for greediness */  else return FALSE;
2073    
2074  greedy_default = ((options & PCRE_UNGREEDY) != 0);  /* Skip whitespace and comments in extended mode */
 greedy_non_default = greedy_default ^ 1;  
2075    
2076  /* Initialize no first byte, no required byte. REQ_UNSET means "no char  if ((options & PCRE_EXTENDED) != 0)
2077  matching encountered yet". It gets changed to REQ_NONE if we hit something that    {
2078  matches a non-fixed char first char; reqbyte just remains unset if we never    for (;;)
2079  find one.      {
2080        while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2081        if (*ptr == '#')
2082          {
2083          while (*(++ptr) != 0)
2084            if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2085          }
2086        else break;
2087        }
2088      }
2089    
2090  When we hit a repeat whose minimum is zero, we may have to adjust these values  /* If the next thing is itself optional, we have to give up. */
 to take the zero repeat into account. This is implemented by setting them to  
 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual  
 item types that can be repeated set these backoff variables appropriately. */  
2091    
2092  firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;  if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
2093      return FALSE;
2094    
2095  /* The variable req_caseopt contains either the REQ_CASELESS value or zero,  /* Now compare the next item with the previous opcode. If the previous is a
2096  according to the current setting of the caseless flag. REQ_CASELESS is a bit  positive single character match, "item" either contains the character or, if
2097  value > 255. It is added into the firstbyte or reqbyte variables to record the  "item" is greater than 127 in utf8 mode, the character's bytes are in
2098  case status of the value. This is used only for ASCII characters. */  utf8_char. */
2099    
 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;  
2100    
2101  /* Switch on next character until the end of the branch */  /* Handle cases when the next item is a character. */
2102    
2103  for (;; ptr++)  if (next >= 0) switch(op_code)
2104    {    {
2105    BOOL negate_class;    case OP_CHAR:
2106    BOOL possessive_quantifier;  #ifdef SUPPORT_UTF8
2107    BOOL is_quantifier;    if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2108    int class_charcount;  #endif
2109    int class_lastchar;    return item != next;
2110    int newoptions;  
2111      /* For CHARNC (caseless character) we must check the other case. If we have
2112      Unicode property support, we can use it to test the other case of
2113      high-valued characters. */
2114    
2115      case OP_CHARNC:
2116    #ifdef SUPPORT_UTF8
2117      if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2118    #endif
2119      if (item == next) return FALSE;
2120    #ifdef SUPPORT_UTF8
2121      if (utf8)
2122        {
2123        unsigned int othercase;
2124        if (next < 128) othercase = cd->fcc[next]; else
2125    #ifdef SUPPORT_UCP
2126        othercase = _pcre_ucp_othercase((unsigned int)next);
2127    #else
2128        othercase = NOTACHAR;
2129    #endif
2130        return (unsigned int)item != othercase;
2131        }
2132      else
2133    #endif  /* SUPPORT_UTF8 */
2134      return (item != cd->fcc[next]);  /* Non-UTF-8 mode */
2135    
2136      /* For OP_NOT, "item" must be a single-byte character. */
2137    
2138      case OP_NOT:
2139      if (item == next) return TRUE;
2140      if ((options & PCRE_CASELESS) == 0) return FALSE;
2141    #ifdef SUPPORT_UTF8
2142      if (utf8)
2143        {
2144        unsigned int othercase;
2145        if (next < 128) othercase = cd->fcc[next]; else
2146    #ifdef SUPPORT_UCP
2147        othercase = _pcre_ucp_othercase(next);
2148    #else
2149        othercase = NOTACHAR;
2150    #endif
2151        return (unsigned int)item == othercase;
2152        }
2153      else
2154    #endif  /* SUPPORT_UTF8 */
2155      return (item == cd->fcc[next]);  /* Non-UTF-8 mode */
2156    
2157      case OP_DIGIT:
2158      return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2159    
2160      case OP_NOT_DIGIT:
2161      return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2162    
2163      case OP_WHITESPACE:
2164      return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2165    
2166      case OP_NOT_WHITESPACE:
2167      return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2168    
2169      case OP_WORDCHAR:
2170      return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2171    
2172      case OP_NOT_WORDCHAR:
2173      return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2174    
2175      case OP_HSPACE:
2176      case OP_NOT_HSPACE:
2177      switch(next)
2178        {
2179        case 0x09:
2180        case 0x20:
2181        case 0xa0:
2182        case 0x1680:
2183        case 0x180e:
2184        case 0x2000:
2185        case 0x2001:
2186        case 0x2002:
2187        case 0x2003:
2188        case 0x2004:
2189        case 0x2005:
2190        case 0x2006:
2191        case 0x2007:
2192        case 0x2008:
2193        case 0x2009:
2194        case 0x200A:
2195        case 0x202f:
2196        case 0x205f:
2197        case 0x3000:
2198        return op_code != OP_HSPACE;
2199        default:
2200        return op_code == OP_HSPACE;
2201        }
2202    
2203      case OP_VSPACE:
2204      case OP_NOT_VSPACE:
2205      switch(next)
2206        {
2207        case 0x0a:
2208        case 0x0b:
2209        case 0x0c:
2210        case 0x0d:
2211        case 0x85:
2212        case 0x2028:
2213        case 0x2029:
2214        return op_code != OP_VSPACE;
2215        default:
2216        return op_code == OP_VSPACE;
2217        }
2218    
2219      default:
2220      return FALSE;
2221      }
2222    
2223    
2224    /* Handle the case when the next item is \d, \s, etc. */
2225    
2226    switch(op_code)
2227      {
2228      case OP_CHAR:
2229      case OP_CHARNC:
2230    #ifdef SUPPORT_UTF8
2231      if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2232    #endif
2233      switch(-next)
2234        {
2235        case ESC_d:
2236        return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2237    
2238        case ESC_D:
2239        return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2240    
2241        case ESC_s:
2242        return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2243    
2244        case ESC_S:
2245        return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2246    
2247        case ESC_w:
2248        return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2249    
2250        case ESC_W:
2251        return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2252    
2253        case ESC_h:
2254        case ESC_H:
2255        switch(item)
2256          {
2257          case 0x09:
2258          case 0x20:
2259          case 0xa0:
2260          case 0x1680:
2261          case 0x180e:
2262          case 0x2000:
2263          case 0x2001:
2264          case 0x2002:
2265          case 0x2003:
2266          case 0x2004:
2267          case 0x2005:
2268          case 0x2006:
2269          case 0x2007:
2270          case 0x2008:
2271          case 0x2009:
2272          case 0x200A:
2273          case 0x202f:
2274          case 0x205f:
2275          case 0x3000:
2276          return -next != ESC_h;
2277          default:
2278          return -next == ESC_h;
2279          }
2280    
2281        case ESC_v:
2282        case ESC_V:
2283        switch(item)
2284          {
2285          case 0x0a:
2286          case 0x0b:
2287          case 0x0c:
2288          case 0x0d:
2289          case 0x85:
2290          case 0x2028:
2291          case 0x2029:
2292          return -next != ESC_v;
2293          default:
2294          return -next == ESC_v;
2295          }
2296    
2297        default:
2298        return FALSE;
2299        }
2300    
2301      case OP_DIGIT:
2302      return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2303             next == -ESC_h || next == -ESC_v;
2304    
2305      case OP_NOT_DIGIT:
2306      return next == -ESC_d;
2307    
2308      case OP_WHITESPACE:
2309      return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2310    
2311      case OP_NOT_WHITESPACE:
2312      return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2313    
2314      case OP_HSPACE:
2315      return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2316    
2317      case OP_NOT_HSPACE:
2318      return next == -ESC_h;
2319    
2320      /* Can't have \S in here because VT matches \S (Perl anomaly) */
2321      case OP_VSPACE:
2322      return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2323    
2324      case OP_NOT_VSPACE:
2325      return next == -ESC_v;
2326    
2327      case OP_WORDCHAR:
2328      return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2329    
2330      case OP_NOT_WORDCHAR:
2331      return next == -ESC_w || next == -ESC_d;
2332    
2333      default:
2334      return FALSE;
2335      }
2336    
2337    /* Control does not reach here */
2338    }
2339    
2340    
2341    
2342    /*************************************************
2343    *           Compile one branch                   *
2344    *************************************************/
2345    
2346    /* Scan the pattern, compiling it into the code vector. If the options are
2347    changed during the branch, the pointer is used to change the external options
2348    bits. This function is used during the pre-compile phase when we are trying
2349    to find out the amount of memory needed, as well as during the real compile
2350    phase. The value of lengthptr distinguishes the two phases.
2351    
2352    Arguments:
2353      optionsptr     pointer to the option bits
2354      codeptr        points to the pointer to the current code point
2355      ptrptr         points to the current pattern pointer
2356      errorcodeptr   points to error code variable
2357      firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2358      reqbyteptr     set to the last literal character required, else < 0
2359      bcptr          points to current branch chain
2360      cd             contains pointers to tables etc.
2361      lengthptr      NULL during the real compile phase
2362                     points to length accumulator during pre-compile phase
2363    
2364    Returns:         TRUE on success
2365                     FALSE, with *errorcodeptr set non-zero on error
2366    */
2367    
2368    static BOOL
2369    compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2370      int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2371      compile_data *cd, int *lengthptr)
2372    {
2373    int repeat_type, op_type;
2374    int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
2375    int bravalue = 0;
2376    int greedy_default, greedy_non_default;
2377    int firstbyte, reqbyte;
2378    int zeroreqbyte, zerofirstbyte;
2379    int req_caseopt, reqvary, tempreqvary;
2380    int options = *optionsptr;
2381    int after_manual_callout = 0;
2382    int length_prevgroup = 0;
2383    register int c;
2384    register uschar *code = *codeptr;
2385    uschar *last_code = code;
2386    uschar *orig_code = code;
2387    uschar *tempcode;
2388    BOOL inescq = FALSE;
2389    BOOL groupsetfirstbyte = FALSE;
2390    const uschar *ptr = *ptrptr;
2391    const uschar *tempptr;
2392    uschar *previous = NULL;
2393    uschar *previous_callout = NULL;
2394    uschar *save_hwm = NULL;
2395    uschar classbits[32];
2396    
2397    #ifdef SUPPORT_UTF8
2398    BOOL class_utf8;
2399    BOOL utf8 = (options & PCRE_UTF8) != 0;
2400    uschar *class_utf8data;
2401    uschar *class_utf8data_base;
2402    uschar utf8_char[6];
2403    #else
2404    BOOL utf8 = FALSE;
2405    uschar *utf8_char = NULL;
2406    #endif
2407    
2408    #ifdef DEBUG
2409    if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2410    #endif
2411    
2412    /* Set up the default and non-default settings for greediness */
2413    
2414    greedy_default = ((options & PCRE_UNGREEDY) != 0);
2415    greedy_non_default = greedy_default ^ 1;
2416    
2417    /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2418    matching encountered yet". It gets changed to REQ_NONE if we hit something that
2419    matches a non-fixed char first char; reqbyte just remains unset if we never
2420    find one.
2421    
2422    When we hit a repeat whose minimum is zero, we may have to adjust these values
2423    to take the zero repeat into account. This is implemented by setting them to
2424    zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2425    item types that can be repeated set these backoff variables appropriately. */
2426    
2427    firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2428    
2429    /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2430    according to the current setting of the caseless flag. REQ_CASELESS is a bit
2431    value > 255. It is added into the firstbyte or reqbyte variables to record the
2432    case status of the value. This is used only for ASCII characters. */
2433    
2434    req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2435    
2436    /* Switch on next character until the end of the branch */
2437    
2438    for (;; ptr++)
2439      {
2440      BOOL negate_class;
2441      BOOL should_flip_negation;
2442      BOOL possessive_quantifier;
2443      BOOL is_quantifier;
2444      BOOL is_recurse;
2445      BOOL reset_bracount;
2446      int class_charcount;
2447      int class_lastchar;
2448      int newoptions;
2449    int recno;    int recno;
2450      int refsign;
2451    int skipbytes;    int skipbytes;
2452    int subreqbyte;    int subreqbyte;
2453    int subfirstbyte;    int subfirstbyte;
2454      int terminator;
2455    int mclength;    int mclength;
2456    uschar mcbuffer[8];    uschar mcbuffer[8];
2457    
2458    /* Next byte in the pattern */    /* Get next byte in the pattern */
2459    
2460    c = *ptr;    c = *ptr;
2461    
2462      /* If we are in the pre-compile phase, accumulate the length used for the
2463      previous cycle of this loop. */
2464    
2465      if (lengthptr != NULL)
2466        {
2467    #ifdef DEBUG
2468        if (code > cd->hwm) cd->hwm = code;                 /* High water info */
2469    #endif
2470        if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2471          {
2472          *errorcodeptr = ERR52;
2473          goto FAILED;
2474          }
2475    
2476        /* There is at least one situation where code goes backwards: this is the
2477        case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2478        the class is simply eliminated. However, it is created first, so we have to
2479        allow memory for it. Therefore, don't ever reduce the length at this point.
2480        */
2481    
2482        if (code < last_code) code = last_code;
2483    
2484        /* Paranoid check for integer overflow */
2485    
2486        if (OFLOW_MAX - *lengthptr < code - last_code)
2487          {
2488          *errorcodeptr = ERR20;
2489          goto FAILED;
2490          }
2491    
2492        *lengthptr += code - last_code;
2493        DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2494    
2495        /* If "previous" is set and it is not at the start of the work space, move
2496        it back to there, in order to avoid filling up the work space. Otherwise,
2497        if "previous" is NULL, reset the current code pointer to the start. */
2498    
2499        if (previous != NULL)
2500          {
2501          if (previous > orig_code)
2502            {
2503            memmove(orig_code, previous, code - previous);
2504            code -= previous - orig_code;
2505            previous = orig_code;
2506            }
2507          }
2508        else code = orig_code;
2509    
2510        /* Remember where this code item starts so we can pick up the length
2511        next time round. */
2512    
2513        last_code = code;
2514        }
2515    
2516      /* In the real compile phase, just check the workspace used by the forward
2517      reference list. */
2518    
2519      else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2520        {
2521        *errorcodeptr = ERR52;
2522        goto FAILED;
2523        }
2524    
2525    /* If in \Q...\E, check for the end; if not, we have a literal */    /* If in \Q...\E, check for the end; if not, we have a literal */
2526    
2527    if (inescq && c != 0)    if (inescq && c != 0)
# Line 1692  for (;; ptr++) Line 2536  for (;; ptr++)
2536        {        {
2537        if (previous_callout != NULL)        if (previous_callout != NULL)
2538          {          {
2539          complete_callout(previous_callout, ptr, cd);          if (lengthptr == NULL)  /* Don't attempt in pre-compile phase */
2540              complete_callout(previous_callout, ptr, cd);
2541          previous_callout = NULL;          previous_callout = NULL;
2542          }          }
2543        if ((options & PCRE_AUTO_CALLOUT) != 0)        if ((options & PCRE_AUTO_CALLOUT) != 0)
# Line 1713  for (;; ptr++) Line 2558  for (;; ptr++)
2558    if (!is_quantifier && previous_callout != NULL &&    if (!is_quantifier && previous_callout != NULL &&
2559         after_manual_callout-- <= 0)         after_manual_callout-- <= 0)
2560      {      {
2561      complete_callout(previous_callout, ptr, cd);      if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
2562          complete_callout(previous_callout, ptr, cd);
2563      previous_callout = NULL;      previous_callout = NULL;
2564      }      }
2565    
# Line 1724  for (;; ptr++) Line 2570  for (;; ptr++)
2570      if ((cd->ctypes[c] & ctype_space) != 0) continue;      if ((cd->ctypes[c] & ctype_space) != 0) continue;
2571      if (c == '#')      if (c == '#')
2572        {        {
2573        while (*(++ptr) != 0) if (IS_NEWLINE(ptr)) break;        while (*(++ptr) != 0)
       if (*ptr != 0)  
2574          {          {
2575          ptr += cd->nllen - 1;          if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
         continue;  
2576          }          }
2577          if (*ptr != 0) continue;
2578    
2579        /* Else fall through to handle end of string */        /* Else fall through to handle end of string */
2580        c = 0;        c = 0;
2581        }        }
# Line 1745  for (;; ptr++) Line 2591  for (;; ptr++)
2591    
2592    switch(c)    switch(c)
2593      {      {
2594      /* The branch terminates at end of string, |, or ). */      /* ===================================================================*/
2595        case 0:                        /* The branch terminates at string end */
2596      case 0:      case '|':                      /* or | or ) */
     case '|':  
2597      case ')':      case ')':
2598      *firstbyteptr = firstbyte;      *firstbyteptr = firstbyte;
2599      *reqbyteptr = reqbyte;      *reqbyteptr = reqbyte;
2600      *codeptr = code;      *codeptr = code;
2601      *ptrptr = ptr;      *ptrptr = ptr;
2602        if (lengthptr != NULL)
2603          {
2604          if (OFLOW_MAX - *lengthptr < code - last_code)
2605            {
2606            *errorcodeptr = ERR20;
2607            goto FAILED;
2608            }
2609          *lengthptr += code - last_code;   /* To include callout length */
2610          DPRINTF((">> end branch\n"));
2611          }
2612      return TRUE;      return TRUE;
2613    
2614    
2615        /* ===================================================================*/
2616      /* Handle single-character metacharacters. In multiline mode, ^ disables      /* Handle single-character metacharacters. In multiline mode, ^ disables
2617      the setting of any following char as a first character. */      the setting of any following char as a first character. */
2618    
# Line 1784  for (;; ptr++) Line 2641  for (;; ptr++)
2641      *code++ = OP_ANY;      *code++ = OP_ANY;
2642      break;      break;
2643    
2644    
2645        /* ===================================================================*/
2646      /* Character classes. If the included characters are all < 256, we build a      /* Character classes. If the included characters are all < 256, we build a
2647      32-byte bitmap of the permitted characters, except in the special case      32-byte bitmap of the permitted characters, except in the special case
2648      where there is only one such character. For negated classes, we build the      where there is only one such character. For negated classes, we build the
# Line 1803  for (;; ptr++) Line 2662  for (;; ptr++)
2662      they are encountered at the top level, so we'll do that too. */      they are encountered at the top level, so we'll do that too. */
2663    
2664      if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&      if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2665          check_posix_syntax(ptr, &tempptr, cd))          check_posix_syntax(ptr, &tempptr))
2666        {        {
2667        *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;        *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2668        goto FAILED;        goto FAILED;
2669        }        }
2670    
2671      /* If the first character is '^', set the negation flag and skip it. */      /* If the first character is '^', set the negation flag and skip it. Also,
2672        if the first few characters (either before or after ^) are \Q\E or \E we
2673        skip them too. This makes for compatibility with Perl. */
2674    
2675      if ((c = *(++ptr)) == '^')      negate_class = FALSE;
2676        for (;;)
2677        {        {
       negate_class = TRUE;  
2678        c = *(++ptr);        c = *(++ptr);
2679          if (c == '\\')
2680            {
2681            if (ptr[1] == 'E') ptr++;
2682              else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
2683                else break;
2684            }
2685          else if (!negate_class && c == '^')
2686            negate_class = TRUE;
2687          else break;
2688        }        }
2689      else  
2690        {      /* If a class contains a negative special such as \S, we need to flip the
2691        negate_class = FALSE;      negation flag at the end, so that support for characters > 255 works
2692        }      correctly (they are all included in the class). */
2693    
2694        should_flip_negation = FALSE;
2695    
2696      /* Keep a count of chars with values < 256 so that we can optimize the case      /* Keep a count of chars with values < 256 so that we can optimize the case
2697      of just a single character (as long as it's < 256). For higher valued UTF-8      of just a single character (as long as it's < 256). However, for higher
2698      characters, we don't yet do any optimization. */      valued UTF-8 characters, we don't yet do any optimization. */
2699    
2700      class_charcount = 0;      class_charcount = 0;
2701      class_lastchar = -1;      class_lastchar = -1;
2702    
2703        /* Initialize the 32-char bit map to all zeros. We build the map in a
2704        temporary bit of memory, in case the class contains only 1 character (less
2705        than 256), because in that case the compiled code doesn't use the bit map.
2706        */
2707    
2708        memset(classbits, 0, 32 * sizeof(uschar));
2709    
2710  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2711      class_utf8 = FALSE;                       /* No chars >= 256 */      class_utf8 = FALSE;                       /* No chars >= 256 */
2712      class_utf8data = code + LINK_SIZE + 34;   /* For UTF-8 items */      class_utf8data = code + LINK_SIZE + 2;    /* For UTF-8 items */
2713        class_utf8data_base = class_utf8data;     /* For resetting in pass 1 */
2714  #endif  #endif
2715    
     /* Initialize the 32-char bit map to all zeros. We have to build the  
     map in a temporary bit of store, in case the class contains only 1  
     character (< 256), because in that case the compiled code doesn't use the  
     bit map. */  
   
     memset(classbits, 0, 32 * sizeof(uschar));  
   
2716      /* Process characters until ] is reached. By writing this as a "do" it      /* Process characters until ] is reached. By writing this as a "do" it
2717      means that an initial ] is taken as a data character. The first pass      means that an initial ] is taken as a data character. At the start of the
2718      through the regex checked the overall syntax, so we don't need to be very      loop, c contains the first byte of the character. */
     strict here. At the start of the loop, c contains the first byte of the  
     character. */  
2719    
2720      do      if (c != 0) do
2721        {        {
2722          const uschar *oldptr;
2723    
2724  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2725        if (utf8 && c > 127)        if (utf8 && c > 127)
2726          {                           /* Braces are required because the */          {                           /* Braces are required because the */
2727          GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */          GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
2728          }          }
2729    
2730          /* In the pre-compile phase, accumulate the length of any UTF-8 extra
2731          data and reset the pointer. This is so that very large classes that
2732          contain a zillion UTF-8 characters no longer overwrite the work space
2733          (which is on the stack). */
2734    
2735          if (lengthptr != NULL)
2736            {
2737            *lengthptr += class_utf8data - class_utf8data_base;
2738            class_utf8data = class_utf8data_base;
2739            }
2740    
2741  #endif  #endif
2742    
2743        /* Inside \Q...\E everything is literal except \E */        /* Inside \Q...\E everything is literal except \E */
2744    
2745        if (inescq)        if (inescq)
2746          {          {
2747          if (c == '\\' && ptr[1] == 'E')          if (c == '\\' && ptr[1] == 'E')     /* If we are at \E */
2748            {            {
2749            inescq = FALSE;            inescq = FALSE;                   /* Reset literal state */
2750            ptr++;            ptr++;                            /* Skip the 'E' */
2751            continue;            continue;                         /* Carry on with next */
2752            }            }
2753          else goto LONE_SINGLE_CHARACTER;          goto CHECK_RANGE;                   /* Could be range if \E follows */
2754          }          }
2755    
2756        /* Handle POSIX class names. Perl allows a negation extension of the        /* Handle POSIX class names. Perl allows a negation extension of the
# Line 1876  for (;; ptr++) Line 2761  for (;; ptr++)
2761    
2762        if (c == '[' &&        if (c == '[' &&
2763            (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&            (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2764            check_posix_syntax(ptr, &tempptr, cd))            check_posix_syntax(ptr, &tempptr))
2765          {          {
2766          BOOL local_negate = FALSE;          BOOL local_negate = FALSE;
2767          int posix_class, taboffset, tabopt;          int posix_class, taboffset, tabopt;
# Line 1893  for (;; ptr++) Line 2778  for (;; ptr++)
2778          if (*ptr == '^')          if (*ptr == '^')
2779            {            {
2780            local_negate = TRUE;            local_negate = TRUE;
2781              should_flip_negation = TRUE;  /* Note negative special */
2782            ptr++;            ptr++;
2783            }            }
2784    
# Line 1956  for (;; ptr++) Line 2842  for (;; ptr++)
2842          }          }
2843    
2844        /* Backslash may introduce a single character, or it may introduce one        /* Backslash may introduce a single character, or it may introduce one
2845        of the specials, which just set a flag. Escaped items are checked for        of the specials, which just set a flag. The sequence \b is a special
2846        validity in the pre-compiling pass. The sequence \b is a special case.        case. Inside a class (and only there) it is treated as backspace.
2847        Inside a class (and only there) it is treated as backspace. Elsewhere        Elsewhere it marks a word boundary. Other escapes have preset maps ready
2848        it marks a word boundary. Other escapes have preset maps ready to        to 'or' into the one we are building. We assume they have more than one
       or into the one we are building. We assume they have more than one  
2849        character in them, so set class_charcount bigger than one. */        character in them, so set class_charcount bigger than one. */
2850    
2851        if (c == '\\')        if (c == '\\')
2852          {          {
2853          c = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);          c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2854            if (*errorcodeptr != 0) goto FAILED;
2855    
2856          if (-c == ESC_b) c = '\b';       /* \b is backslash in a class */          if (-c == ESC_b) c = '\b';       /* \b is backspace in a class */
2857          else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */          else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */
2858            else if (-c == ESC_R) c = 'R';   /* \R is literal R in a class */
2859          else if (-c == ESC_Q)            /* Handle start of quoted string */          else if (-c == ESC_Q)            /* Handle start of quoted string */
2860            {            {
2861            if (ptr[1] == '\\' && ptr[2] == 'E')            if (ptr[1] == '\\' && ptr[2] == 'E')
# Line 1978  for (;; ptr++) Line 2865  for (;; ptr++)
2865            else inescq = TRUE;            else inescq = TRUE;
2866            continue;            continue;
2867            }            }
2868            else if (-c == ESC_E) continue;  /* Ignore orphan \E */
2869    
2870          if (c < 0)          if (c < 0)
2871            {            {
2872            register const uschar *cbits = cd->cbits;            register const uschar *cbits = cd->cbits;
2873            class_charcount += 2;     /* Greater than 1 is what matters */            class_charcount += 2;     /* Greater than 1 is what matters */
2874            switch (-c)  
2875              /* Save time by not doing this in the pre-compile phase. */
2876    
2877              if (lengthptr == NULL) switch (-c)
2878              {              {
2879              case ESC_d:              case ESC_d:
2880              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2881              continue;              continue;
2882    
2883              case ESC_D:              case ESC_D:
2884                should_flip_negation = TRUE;
2885              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2886              continue;              continue;
2887    
# Line 1998  for (;; ptr++) Line 2890  for (;; ptr++)
2890              continue;              continue;
2891    
2892              case ESC_W:              case ESC_W:
2893                should_flip_negation = TRUE;
2894              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2895              continue;              continue;
2896    
# Line 2007  for (;; ptr++) Line 2900  for (;; ptr++)
2900              continue;              continue;
2901    
2902              case ESC_S:              case ESC_S:
2903                should_flip_negation = TRUE;
2904              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2905              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
2906              continue;              continue;
2907    
2908  #ifdef SUPPORT_UCP              default:    /* Not recognized; fall through */
2909              case ESC_p:              break;      /* Need "default" setting to stop compiler warning. */
2910              case ESC_P:              }
2911    
2912              /* In the pre-compile phase, just do the recognition. */
2913    
2914              else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2915                       c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2916    
2917              /* We need to deal with \H, \h, \V, and \v in both phases because
2918              they use extra memory. */
2919    
2920              if (-c == ESC_h)
2921                {
2922                SETBIT(classbits, 0x09); /* HT */
2923                SETBIT(classbits, 0x20); /* SPACE */
2924                SETBIT(classbits, 0xa0); /* NBSP */
2925    #ifdef SUPPORT_UTF8
2926                if (utf8)
2927                {                {
               BOOL negated;  
               int pdata;  
               int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);  
               if (ptype < 0) goto FAILED;  
2928                class_utf8 = TRUE;                class_utf8 = TRUE;
2929                *class_utf8data++ = ((-c == ESC_p) != negated)?                *class_utf8data++ = XCL_SINGLE;
2930                  XCL_PROP : XCL_NOTPROP;                class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2931                *class_utf8data++ = ptype;                *class_utf8data++ = XCL_SINGLE;
2932                *class_utf8data++ = pdata;                class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2933                class_charcount -= 2;   /* Not a < 256 character */                *class_utf8data++ = XCL_RANGE;
2934                  class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2935                  class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2936                  *class_utf8data++ = XCL_SINGLE;
2937                  class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2938                  *class_utf8data++ = XCL_SINGLE;
2939                  class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2940                  *class_utf8data++ = XCL_SINGLE;
2941                  class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
2942                }                }
             continue;  
2943  #endif  #endif
2944                continue;
2945                }
2946    
2947              /* Unrecognized escapes are faulted if PCRE is running in its            if (-c == ESC_H)
2948              strict mode. By default, for compatibility with Perl, they are              {
2949              treated as literals. */              for (c = 0; c < 32; c++)
2950                  {
2951                  int x = 0xff;
2952                  switch (c)
2953                    {
2954                    case 0x09/8: x ^= 1 << (0x09%8); break;
2955                    case 0x20/8: x ^= 1 << (0x20%8); break;
2956                    case 0xa0/8: x ^= 1 << (0xa0%8); break;
2957                    default: break;
2958                    }
2959                  classbits[c] |= x;
2960                  }
2961    
2962              default:  #ifdef SUPPORT_UTF8
2963              if ((options & PCRE_EXTRA) != 0)              if (utf8)
2964                {                {
2965                *errorcodeptr = ERR7;                class_utf8 = TRUE;
2966                goto FAILED;                *class_utf8data++ = XCL_RANGE;
2967                  class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2968                  class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
2969                  *class_utf8data++ = XCL_RANGE;
2970                  class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
2971                  class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
2972                  *class_utf8data++ = XCL_RANGE;
2973                  class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
2974                  class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
2975                  *class_utf8data++ = XCL_RANGE;
2976                  class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
2977                  class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
2978                  *class_utf8data++ = XCL_RANGE;
2979                  class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
2980                  class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
2981                  *class_utf8data++ = XCL_RANGE;
2982                  class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
2983                  class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
2984                  *class_utf8data++ = XCL_RANGE;
2985                  class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
2986                  class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2987                }                }
2988              c = *ptr;              /* The final character */  #endif
2989              class_charcount -= 2;  /* Undo the default count from above */              continue;
2990              }              }
2991            }  
2992              if (-c == ESC_v)
2993          /* Fall through if we have a single character (c >= 0). This may be              {
2994          > 256 in UTF-8 mode. */              SETBIT(classbits, 0x0a); /* LF */
2995                SETBIT(classbits, 0x0b); /* VT */
2996                SETBIT(classbits, 0x0c); /* FF */
2997                SETBIT(classbits, 0x0d); /* CR */
2998                SETBIT(classbits, 0x85); /* NEL */
2999    #ifdef SUPPORT_UTF8
3000                if (utf8)
3001                  {
3002                  class_utf8 = TRUE;
3003                  *class_utf8data++ = XCL_RANGE;
3004                  class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
3005                  class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3006                  }
3007    #endif
3008                continue;
3009                }
3010    
3011              if (-c == ESC_V)
3012                {
3013                for (c = 0; c < 32; c++)
3014                  {
3015                  int x = 0xff;
3016                  switch (c)
3017                    {
3018                    case 0x0a/8: x ^= 1 << (0x0a%8);
3019                                 x ^= 1 << (0x0b%8);
3020                                 x ^= 1 << (0x0c%8);
3021                                 x ^= 1 << (0x0d%8);
3022                                 break;
3023                    case 0x85/8: x ^= 1 << (0x85%8); break;
3024                    default: break;
3025                    }
3026                  classbits[c] |= x;
3027                  }
3028    
3029    #ifdef SUPPORT_UTF8
3030                if (utf8)
3031                  {
3032                  class_utf8 = TRUE;
3033                  *class_utf8data++ = XCL_RANGE;
3034                  class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3035                  class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
3036                  *class_utf8data++ = XCL_RANGE;
3037                  class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3038                  class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3039                  }
3040    #endif
3041                continue;
3042                }
3043    
3044              /* We need to deal with \P and \p in both phases. */
3045    
3046    #ifdef SUPPORT_UCP
3047              if (-c == ESC_p || -c == ESC_P)
3048                {
3049                BOOL negated;
3050                int pdata;
3051                int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
3052                if (ptype < 0) goto FAILED;
3053                class_utf8 = TRUE;
3054                *class_utf8data++ = ((-c == ESC_p) != negated)?
3055                  XCL_PROP : XCL_NOTPROP;
3056                *class_utf8data++ = ptype;
3057                *class_utf8data++ = pdata;
3058                class_charcount -= 2;   /* Not a < 256 character */
3059                continue;
3060                }
3061    #endif
3062              /* Unrecognized escapes are faulted if PCRE is running in its
3063              strict mode. By default, for compatibility with Perl, they are
3064              treated as literals. */
3065    
3066              if ((options & PCRE_EXTRA) != 0)
3067                {
3068                *errorcodeptr = ERR7;
3069                goto FAILED;
3070                }
3071    
3072              class_charcount -= 2;  /* Undo the default count from above */
3073              c = *ptr;              /* Get the final character and fall through */
3074              }
3075    
3076            /* Fall through if we have a single character (c >= 0). This may be
3077            greater than 256 in UTF-8 mode. */
3078    
3079          }   /* End of backslash handling */          }   /* End of backslash handling */
3080    
3081        /* A single character may be followed by '-' to form a range. However,        /* A single character may be followed by '-' to form a range. However,
3082        Perl does not permit ']' to be the end of the range. A '-' character        Perl does not permit ']' to be the end of the range. A '-' character
3083        here is treated as a literal. */        at the end is treated as a literal. Perl ignores orphaned \E sequences
3084          entirely. The code for handling \Q and \E is messy. */
3085    
3086          CHECK_RANGE:
3087          while (ptr[1] == '\\' && ptr[2] == 'E')
3088            {
3089            inescq = FALSE;
3090            ptr += 2;
3091            }
3092    
3093          oldptr = ptr;
3094    
3095        if (ptr[1] == '-' && ptr[2] != ']')        /* Remember \r or \n */
3096    
3097          if (c == '\r' || c == '\n') cd->external_flags |= PCRE_HASCRORLF;
3098    
3099          /* Check for range */
3100    
3101          if (!inescq && ptr[1] == '-')
3102          {          {
3103          int d;          int d;
3104          ptr += 2;          ptr += 2;
3105            while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
3106    
3107            /* If we hit \Q (not followed by \E) at this point, go into escaped
3108            mode. */
3109    
3110            while (*ptr == '\\' && ptr[1] == 'Q')
3111              {
3112              ptr += 2;
3113              if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
3114              inescq = TRUE;
3115              break;
3116              }
3117    
3118            if (*ptr == 0 || (!inescq && *ptr == ']'))
3119              {
3120              ptr = oldptr;
3121              goto LONE_SINGLE_CHARACTER;
3122              }
3123    
3124  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
3125          if (utf8)          if (utf8)
# Line 2071  for (;; ptr++) Line 3134  for (;; ptr++)
3134          not any of the other escapes. Perl 5.6 treats a hyphen as a literal          not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3135          in such circumstances. */          in such circumstances. */
3136    
3137          if (d == '\\')          if (!inescq && d == '\\')
3138            {            {
3139            const uschar *oldptr = ptr;            d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3140            d = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);            if (*errorcodeptr != 0) goto FAILED;
3141    
3142            /* \b is backslash; \X is literal X; any other special means the '-'            /* \b is backspace; \X is literal X; \R is literal R; any other
3143            was literal */            special means the '-' was literal */
3144    
3145            if (d < 0)            if (d < 0)
3146              {              {
3147              if (d == -ESC_b) d = '\b';              if (d == -ESC_b) d = '\b';
3148              else if (d == -ESC_X) d = 'X'; else              else if (d == -ESC_X) d = 'X';
3149                else if (d == -ESC_R) d = 'R'; else
3150                {                {
3151                ptr = oldptr - 2;                ptr = oldptr;
3152                goto LONE_SINGLE_CHARACTER;  /* A few lines below */                goto LONE_SINGLE_CHARACTER;  /* A few lines below */
3153                }                }
3154              }              }
3155            }            }
3156    
3157          /* The check that the two values are in the correct order happens in          /* Check that the two values are in the correct order. Optimize
3158          the pre-pass. Optimize one-character ranges */          one-character ranges */
3159    
3160            if (d < c)
3161              {
3162              *errorcodeptr = ERR8;
3163              goto FAILED;
3164              }
3165    
3166          if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */          if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */
3167    
3168            /* Remember \r or \n */
3169    
3170            if (d == '\r' || d == '\n') cd->external_flags |= PCRE_HASCRORLF;
3171    
3172          /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless          /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3173          matching, we have to use an XCLASS with extra data items. Caseless          matching, we have to use an XCLASS with extra data items. Caseless
3174          matching for characters > 127 is available only if UCP support is          matching for characters > 127 is available only if UCP support is
# Line 2112  for (;; ptr++) Line 3186  for (;; ptr++)
3186  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
3187            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
3188              {              {
3189              int occ, ocd;              unsigned int occ, ocd;
3190              int cc = c;              unsigned int cc = c;
3191              int origd = d;              unsigned int origd = d;
3192              while (get_othercase_range(&cc, origd, &occ, &ocd))              while (get_othercase_range(&cc, origd, &occ, &ocd))
3193                {                {
3194                if (occ >= c && ocd <= d) continue;  /* Skip embedded ranges */                if (occ >= (unsigned int)c &&
3195                      ocd <= (unsigned int)d)
3196                    continue;                          /* Skip embedded ranges */
3197    
3198                if (occ < c  && ocd >= c - 1)        /* Extend the basic range */                if (occ < (unsigned int)c  &&
3199                      ocd >= (unsigned int)c - 1)      /* Extend the basic range */
3200                  {                                  /* if there is overlap,   */                  {                                  /* if there is overlap,   */
3201                  c = occ;                           /* noting that if occ < c */                  c = occ;                           /* noting that if occ < c */
3202                  continue;                          /* we can't have ocd > d  */                  continue;                          /* we can't have ocd > d  */
3203                  }                                  /* because a subrange is  */                  }                                  /* because a subrange is  */
3204                if (ocd > d && occ <= d + 1)         /* always shorter than    */                if (ocd > (unsigned int)d &&
3205                      occ <= (unsigned int)d + 1)      /* always shorter than    */
3206                  {                                  /* the basic range.       */                  {                                  /* the basic range.       */
3207                  d = ocd;                  d = ocd;
3208                  continue;                  continue;
# Line 2172  for (;; ptr++) Line 3250  for (;; ptr++)
3250          ranges that lie entirely within 0-127 when there is UCP support; else          ranges that lie entirely within 0-127 when there is UCP support; else
3251          for partial ranges without UCP support. */          for partial ranges without UCP support. */
3252    
3253          for (; c <= d; c++)          class_charcount += d - c + 1;
3254            class_lastchar = d;
3255    
3256            /* We can save a bit of time by skipping this in the pre-compile. */
3257    
3258            if (lengthptr == NULL) for (; c <= d; c++)
3259            {            {
3260            classbits[c/8] |= (1 << (c&7));            classbits[c/8] |= (1 << (c&7));
3261            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
# Line 2180  for (;; ptr++) Line 3263  for (;; ptr++)
3263              int uc = cd->fcc[c];           /* flip case */              int uc = cd->fcc[c];           /* flip case */
3264              classbits[uc/8] |= (1 << (uc&7));              classbits[uc/8] |= (1 << (uc&7));
3265              }              }
           class_charcount++;                /* in case a one-char range */  
           class_lastchar = c;  
3266            }            }
3267    
3268          continue;   /* Go get the next char in the class */          continue;   /* Go get the next char in the class */
# Line 2205  for (;; ptr++) Line 3286  for (;; ptr++)
3286  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
3287          if ((options & PCRE_CASELESS) != 0)          if ((options & PCRE_CASELESS) != 0)
3288            {            {
3289            int othercase;            unsigned int othercase;
3290            if ((othercase = _pcre_ucp_othercase(c)) >= 0)            if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
3291              {              {
3292              *class_utf8data++ = XCL_SINGLE;              *class_utf8data++ = XCL_SINGLE;
3293              class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);              class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
# Line 2231  for (;; ptr++) Line 3312  for (;; ptr++)
3312          }          }
3313        }        }
3314    
3315      /* Loop until ']' reached; the check for end of string happens inside the      /* Loop until ']' reached. This "while" is the end of the "do" above. */
3316      loop. This "while" is the end of the "do" above. */  
3317        while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
3318    
3319        if (c == 0)                          /* Missing terminating ']' */
3320          {
3321          *errorcodeptr = ERR6;
3322          goto FAILED;
3323          }
3324    
3325    
3326    /* This code has been disabled because it would mean that \s counts as
3327    an explicit \r or \n reference, and that's not really what is wanted. Now
3328    we set the flag only if there is a literal "\r" or "\n" in the class. */
3329    
3330    #if 0
3331        /* Remember whether \r or \n are in this class */
3332    
3333        if (negate_class)
3334          {
3335          if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF;
3336          }
3337        else
3338          {
3339          if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF;
3340          }
3341    #endif
3342    
     while ((c = *(++ptr)) != ']' || inescq);  
3343    
3344      /* If class_charcount is 1, we saw precisely one character whose value is      /* If class_charcount is 1, we saw precisely one character whose value is
3345      less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we      less than 256. As long as there were no characters >= 128 and there was no
3346      can optimize the negative case only if there were no characters >= 128      use of \p or \P, in other words, no use of any XCLASS features, we can
3347      because OP_NOT and the related opcodes like OP_NOTSTAR operate on      optimize.
3348      single-bytes only. This is an historical hangover. Maybe one day we can  
3349      tidy these opcodes to handle multi-byte characters.      In UTF-8 mode, we can optimize the negative case only if there were no
3350        characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
3351        operate on single-bytes only. This is an historical hangover. Maybe one day
3352        we can tidy these opcodes to handle multi-byte characters.
3353    
3354      The optimization throws away the bit map. We turn the item into a      The optimization throws away the bit map. We turn the item into a
3355      1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note      1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
# Line 2251  for (;; ptr++) Line 3359  for (;; ptr++)
3359      reqbyte, save the previous value for reinstating. */      reqbyte, save the previous value for reinstating. */
3360    
3361  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
3362      if (class_charcount == 1 &&      if (class_charcount == 1 && !class_utf8 &&
3363            (!utf8 ||        (!utf8 || !negate_class || class_lastchar < 128))
           (!class_utf8 && (!negate_class || class_lastchar < 128))))  
   
3364  #else  #else
3365      if (class_charcount == 1)      if (class_charcount == 1)
3366  #endif  #endif
# Line 2297  for (;; ptr++) Line 3403  for (;; ptr++)
3403      zeroreqbyte = reqbyte;      zeroreqbyte = reqbyte;
3404    
3405      /* If there are characters with values > 255, we have to compile an      /* If there are characters with values > 255, we have to compile an
3406      extended class, with its own opcode. If there are no characters < 256,      extended class, with its own opcode, unless there was a negated special
3407      we can omit the bitmap. */      such as \S in the class, because in that case all characters > 255 are in
3408        the class, so any that were explicitly given as well can be ignored. If
3409        (when there are explicit characters > 255 that must be listed) there are no
3410        characters < 256, we can omit the bitmap in the actual compiled code. */
3411    
3412  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
3413      if (class_utf8)      if (class_utf8 && !should_flip_negation)
3414        {        {
3415        *class_utf8data++ = XCL_END;    /* Marks the end of extra data */        *class_utf8data++ = XCL_END;    /* Marks the end of extra data */
3416        *code++ = OP_XCLASS;        *code++ = OP_XCLASS;
3417        code += LINK_SIZE;        code += LINK_SIZE;
3418        *code = negate_class? XCL_NOT : 0;        *code = negate_class? XCL_NOT : 0;
3419    
3420        /* If the map is required, install it, and move on to the end of        /* If the map is required, move up the extra data to make room for it;
3421        the extra data */        otherwise just move the code pointer to the end of the extra data. */
3422    
3423        if (class_charcount > 0)        if (class_charcount > 0)
3424          {          {
3425          *code++ |= XCL_MAP;          *code++ |= XCL_MAP;
3426            memmove(code + 32, code, class_utf8data - code);
3427          memcpy(code, classbits, 32);          memcpy(code, classbits, 32);
3428          code = class_utf8data;          code = class_utf8data + 32;
         }  
   
       /* If the map is not required, slide down the extra data. */  
   
       else  
         {  
         int len = class_utf8data - (code + 33);  
         memmove(code + 1, code + 33, len);  
         code += len + 1;  
3429          }          }
3430          else code = class_utf8data;
3431    
3432        /* Now fill in the complete length of the item */        /* Now fill in the complete length of the item */
3433    
# Line 2334  for (;; ptr++) Line 3436  for (;; ptr++)
3436        }        }
3437  #endif  #endif
3438    
3439      /* If there are no characters > 255, negate the 32-byte map if necessary,      /* If there are no characters > 255, set the opcode to OP_CLASS or
3440      and copy it into the code vector. If this is the first thing in the branch,      OP_NCLASS, depending on whether the whole class was negated and whether
3441      there can be no first char setting, whatever the repeat count. Any reqbyte      there were negative specials such as \S in the class. Then copy the 32-byte
3442      setting must remain unchanged after any kind of repeat. */      map into the code vector, negating it if necessary. */
3443    
3444        *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
3445      if (negate_class)      if (negate_class)
3446        {        {
3447        *code++ = OP_NCLASS;        if (lengthptr == NULL)    /* Save time in the pre-compile phase */
3448        for (c = 0; c < 32; c++) code[c] = ~classbits[c];          for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3449        }        }
3450      else      else
3451        {        {
       *code++ = OP_CLASS;  
3452        memcpy(code, classbits, 32);        memcpy(code, classbits, 32);
3453        }        }
3454      code += 32;      code += 32;
3455      break;      break;
3456    
3457    
3458        /* ===================================================================*/
3459      /* Various kinds of repeat; '{' is not necessarily a quantifier, but this      /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3460      has been tested above. */      has been tested above. */
3461    
# Line 2419  for (;; ptr++) Line 3523  for (;; ptr++)
3523        }        }
3524      else repeat_type = greedy_default;      else repeat_type = greedy_default;
3525    
     /* If previous was a recursion, we need to wrap it inside brackets so that  
     it can be replicated if necessary. */  
   
     if (*previous == OP_RECURSE)  
       {  
       memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);  
       code += 1 + LINK_SIZE;  
       *previous = OP_BRA;  
       PUT(previous, 1, code - previous);  
       *code = OP_KET;  
       PUT(code, 1, code - previous);  
       code += 1 + LINK_SIZE;  
       }  
   
3526      /* If previous was a character match, abolish the item and generate a      /* If previous was a character match, abolish the item and generate a
3527      repeat item instead. If a char item has a minimum of more than one, ensure      repeat item instead. If a char item has a minimum of more than one, ensure
3528      that it is set in reqbyte - it might not be if a sequence such as x{3} is      that it is set in reqbyte - it might not be if a sequence such as x{3} is
# Line 2466  for (;; ptr++) Line 3556  for (;; ptr++)
3556          if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;          if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3557          }          }
3558    
3559          /* If the repetition is unlimited, it pays to see if the next thing on
3560          the line is something that cannot possibly match this character. If so,
3561          automatically possessifying this item gains some performance in the case
3562          where the match fails. */
3563    
3564          if (!possessive_quantifier &&
3565              repeat_max < 0 &&
3566              check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3567                options, cd))
3568            {
3569            repeat_type = 0;    /* Force greedy */
3570            possessive_quantifier = TRUE;
3571            }
3572    
3573        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
3574        }        }
3575    
3576      /* If previous was a single negated character ([^a] or similar), we use      /* If previous was a single negated character ([^a] or similar), we use
3577      one of the special opcodes, replacing it. The code is shared with single-      one of the special opcodes, replacing it. The code is shared with single-
3578      character repeats by setting opt_type to add a suitable offset into      character repeats by setting opt_type to add a suitable offset into
3579      repeat_type. OP_NOT is currently used only for single-byte chars. */      repeat_type. We can also test for auto-possessification. OP_NOT is
3580        currently used only for single-byte chars. */
3581    
3582      else if (*previous == OP_NOT)      else if (*previous == OP_NOT)
3583        {        {
3584        op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */        op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */
3585        c = previous[1];        c = previous[1];
3586          if (!possessive_quantifier &&
3587              repeat_max < 0 &&
3588              check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3589            {
3590            repeat_type = 0;    /* Force greedy */
3591            possessive_quantifier = TRUE;
3592            }
3593        goto OUTPUT_SINGLE_REPEAT;        goto OUTPUT_SINGLE_REPEAT;
3594        }        }
3595    
# Line 2495  for (;; ptr++) Line 3607  for (;; ptr++)
3607        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
3608        c = *previous;        c = *previous;
3609    
3610          if (!possessive_quantifier &&
3611              repeat_max < 0 &&
3612              check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3613            {
3614            repeat_type = 0;    /* Force greedy */
3615            possessive_quantifier = TRUE;
3616            }
3617    
3618        OUTPUT_SINGLE_REPEAT:        OUTPUT_SINGLE_REPEAT:
3619        if (*previous == OP_PROP || *previous == OP_NOTPROP)        if (*previous == OP_PROP || *previous == OP_NOTPROP)
3620          {          {
# Line 2514  for (;; ptr++) Line 3634  for (;; ptr++)
3634        /* All real repeats make it impossible to handle partial matching (maybe        /* All real repeats make it impossible to handle partial matching (maybe
3635        one day we will be able to remove this restriction). */        one day we will be able to remove this restriction). */
3636    
3637        if (repeat_max != 1) cd->nopartial = TRUE;        if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3638    
3639        /* Combine the op_type with the repeat_type */        /* Combine the op_type with the repeat_type */
3640    
# Line 2535  for (;; ptr++) Line 3655  for (;; ptr++)
3655          }          }
3656    
3657        /* A repeat minimum of 1 is optimized into some special cases. If the        /* A repeat minimum of 1 is optimized into some special cases. If the
3658        maximum is unlimited, we use OP_PLUS. Otherwise, the original item it        maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3659        left in place and, if the maximum is greater than 1, we use OP_UPTO with        left in place and, if the maximum is greater than 1, we use OP_UPTO with
3660        one less than the maximum. */        one less than the maximum. */
3661    
# Line 2588  for (;; ptr++) Line 3708  for (;; ptr++)
3708            }            }
3709    
3710          /* Else insert an UPTO if the max is greater than the min, again          /* Else insert an UPTO if the max is greater than the min, again
3711          preceded by the character, for the previously inserted code. */          preceded by the character, for the previously inserted code. If the
3712            UPTO is just for 1 instance, we can use QUERY instead. */
3713    
3714          else if (repeat_max != repeat_min)          else if (repeat_max != repeat_min)
3715            {            {
# Line 2607  for (;; ptr++) Line 3728  for (;; ptr++)
3728              *code++ = prop_value;              *code++ = prop_value;
3729              }              }
3730            repeat_max -= repeat_min;            repeat_max -= repeat_min;
3731            *code++ = OP_UPTO + repeat_type;  
3732            PUT2INC(code, 0, repeat_max);            if (repeat_max == 1)
3733                {
3734                *code++ = OP_QUERY + repeat_type;
3735                }
3736              else
3737                {
3738                *code++ = OP_UPTO + repeat_type;
3739                PUT2INC(code, 0, repeat_max);
3740                }
3741            }            }
3742          }          }
3743    
# Line 2655  for (;; ptr++) Line 3784  for (;; ptr++)
3784        /* All real repeats make it impossible to handle partial matching (maybe        /* All real repeats make it impossible to handle partial matching (maybe
3785        one day we will be able to remove this restriction). */        one day we will be able to remove this restriction). */
3786    
3787        if (repeat_max != 1) cd->nopartial = TRUE;        if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3788    
3789        if (repeat_min == 0 && repeat_max == -1)        if (repeat_min == 0 && repeat_max == -1)
3790          *code++ = OP_CRSTAR + repeat_type;          *code++ = OP_CRSTAR + repeat_type;
# Line 2675  for (;; ptr++) Line 3804  for (;; ptr++)
3804      /* If previous was a bracket group, we may have to replicate it in certain      /* If previous was a bracket group, we may have to replicate it in certain
3805      cases. */      cases. */
3806    
3807      else if (*previous >= OP_BRA || *previous == OP_ONCE ||      else if (*previous == OP_BRA  || *previous == OP_CBRA ||
3808               *previous == OP_COND)               *previous == OP_ONCE || *previous == OP_COND)
3809        {        {
3810        register int i;        register int i;
3811        int ketoffset = 0;        int ketoffset = 0;
3812        int len = code - previous;        int len = code - previous;
3813        uschar *bralink = NULL;        uschar *bralink = NULL;
3814    
3815          /* Repeating a DEFINE group is pointless */
3816    
3817          if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3818            {
3819            *errorcodeptr = ERR55;
3820            goto FAILED;
3821            }
3822    
3823        /* If the maximum repeat count is unlimited, find the end of the bracket        /* If the maximum repeat count is unlimited, find the end of the bracket
3824        by scanning through from the start, and compute the offset back to it        by scanning through from the start, and compute the offset back to it
3825        from the current code pointer. There may be an OP_OPT setting following        from the current code pointer. There may be an OP_OPT setting following
# Line 2717  for (;; ptr++) Line 3854  for (;; ptr++)
3854          /* If the maximum is 1 or unlimited, we just have to stick in the          /* If the maximum is 1 or unlimited, we just have to stick in the
3855          BRAZERO and do no more at this point. However, we do need to adjust          BRAZERO and do no more at this point. However, we do need to adjust
3856          any OP_RECURSE calls inside the group that refer to the group itself or          any OP_RECURSE calls inside the group that refer to the group itself or
3857          any internal group, because the offset is from the start of the whole          any internal or forward referenced group, because the offset is from
3858          regex. Temporarily terminate the pattern while doing this. */          the start of the whole regex. Temporarily terminate the pattern while
3859            doing this. */
3860    
3861          if (repeat_max <= 1)          if (repeat_max <= 1)
3862            {            {
3863            *code = OP_END;            *code = OP_END;
3864            adjust_recurse(previous, 1, utf8, cd);            adjust_recurse(previous, 1, utf8, cd, save_hwm);
3865            memmove(previous+1, previous, len);            memmove(previous+1, previous, len);
3866            code++;            code++;
3867            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
# Line 2741  for (;; ptr++) Line 3879  for (;; ptr++)
3879            {            {
3880            int offset;            int offset;
3881            *code = OP_END;            *code = OP_END;
3882            adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd);            adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3883            memmove(previous + 2 + LINK_SIZE, previous, len);            memmove(previous + 2 + LINK_SIZE, previous, len);
3884            code += 2 + LINK_SIZE;            code += 2 + LINK_SIZE;
3885            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
# Line 2761  for (;; ptr++) Line 3899  for (;; ptr++)
3899        /* If the minimum is greater than zero, replicate the group as many        /* If the minimum is greater than zero, replicate the group as many
3900        times as necessary, and adjust the maximum to the number of subsequent        times as necessary, and adjust the maximum to the number of subsequent
3901        copies that we need. If we set a first char from the group, and didn't        copies that we need. If we set a first char from the group, and didn't
3902        set a required char, copy the latter from the former. */        set a required char, copy the latter from the former. If there are any
3903          forward reference subroutine calls in the group, there will be entries on
3904          the workspace list; replicate these with an appropriate increment. */
3905    
3906        else        else
3907          {          {
3908          if (repeat_min > 1)          if (repeat_min > 1)
3909            {            {
3910            if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;            /* In the pre-compile phase, we don't actually do the replication. We
3911            for (i = 1; i < repeat_min; i++)            just adjust the length as if we had. Do some paranoid checks for
3912              potential integer overflow. */
3913    
3914              if (lengthptr != NULL)
3915                {
3916                int delta = (repeat_min - 1)*length_prevgroup;
3917                if ((double)(repeat_min - 1)*(double)length_prevgroup >
3918                                                                (double)INT_MAX ||
3919                    OFLOW_MAX - *lengthptr < delta)
3920                  {
3921                  *errorcodeptr = ERR20;
3922                  goto FAILED;
3923                  }
3924                *lengthptr += delta;
3925                }
3926    
3927              /* This is compiling for real */
3928    
3929              else
3930              {              {
3931              memcpy(code, previous, len);              if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3932              code += len;              for (i = 1; i < repeat_min; i++)
3933                  {
3934                  uschar *hc;
3935                  uschar *this_hwm = cd->hwm;
3936                  memcpy(code, previous, len);
3937                  for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3938                    {
3939                    PUT(cd->hwm, 0, GET(hc, 0) + len);
3940                    cd->hwm += LINK_SIZE;
3941                    }
3942                  save_hwm = this_hwm;
3943                  code += len;
3944                  }
3945              }              }
3946            }            }
3947    
3948          if (repeat_max > 0) repeat_max -= repeat_min;          if (repeat_max > 0) repeat_max -= repeat_min;
3949          }          }
3950    
# Line 2781  for (;; ptr++) Line 3952  for (;; ptr++)
3952        the maximum is limited, it replicates the group in a nested fashion,        the maximum is limited, it replicates the group in a nested fashion,
3953        remembering the bracket starts on a stack. In the case of a zero minimum,        remembering the bracket starts on a stack. In the case of a zero minimum,
3954        the first one was set up above. In all cases the repeat_max now specifies        the first one was set up above. In all cases the repeat_max now specifies
3955        the number of additional copies needed. */        the number of additional copies needed. Again, we must remember to
3956          replicate entries on the forward reference list. */
3957    
3958        if (repeat_max >= 0)        if (repeat_max >= 0)
3959          {          {
3960          for (i = repeat_max - 1; i >= 0; i--)          /* In the pre-compile phase, we don't actually do the replication. We
3961            just adjust the length as if we had. For each repetition we must add 1
3962            to the length for BRAZERO and for all but the last repetition we must
3963            add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
3964            paranoid checks to avoid integer overflow. */
3965    
3966            if (lengthptr != NULL && repeat_max > 0)
3967              {
3968              int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3969                          2 - 2*LINK_SIZE;   /* Last one doesn't nest */
3970              if ((double)repeat_max *
3971                    (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
3972                      > (double)INT_MAX ||
3973                  OFLOW_MAX - *lengthptr < delta)
3974                {
3975                *errorcodeptr = ERR20;
3976                goto FAILED;
3977                }
3978              *lengthptr += delta;
3979              }
3980    
3981            /* This is compiling for real */
3982    
3983            else for (i = repeat_max - 1; i >= 0; i--)
3984            {            {
3985              uschar *hc;
3986              uschar *this_hwm = cd->hwm;
3987    
3988            *code++ = OP_BRAZERO + repeat_type;            *code++ = OP_BRAZERO + repeat_type;
3989    
3990            /* All but the final copy start a new nesting, maintaining the            /* All but the final copy start a new nesting, maintaining the
# Line 2802  for (;; ptr++) Line 4000  for (;; ptr++)
4000              }              }
4001    
4002            memcpy(code, previous, len);            memcpy(code, previous, len);
4003              for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
4004                {
4005                PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
4006                cd->hwm += LINK_SIZE;
4007                }
4008              save_hwm = this_hwm;
4009            code += len;            code += len;
4010            }            }
4011    
# Line 2824  for (;; ptr++) Line 4028  for (;; ptr++)
4028        /* If the maximum is unlimited, set a repeater in the final copy. We        /* If the maximum is unlimited, set a repeater in the final copy. We
4029        can't just offset backwards from the current code point, because we        can't just offset backwards from the current code point, because we
4030        don't know if there's been an options resetting after the ket. The        don't know if there's been an options resetting after the ket. The
4031        correct offset was computed above. */        correct offset was computed above.
4032    
4033        else code[-ketoffset] = OP_KETRMAX + repeat_type;        Then, when we are doing the actual compile phase, check to see whether
4034          this group is a non-atomic one that could match an empty string. If so,
4035          convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
4036          that runtime checking can be done. [This check is also applied to
4037          atomic groups at runtime, but in a different way.] */
4038    
4039          else
4040            {
4041            uschar *ketcode = code - ketoffset;
4042            uschar *bracode = ketcode - GET(ketcode, 1);
4043            *ketcode = OP_KETRMAX + repeat_type;
4044            if (lengthptr == NULL && *bracode != OP_ONCE)
4045              {
4046              uschar *scode = bracode;
4047              do
4048                {
4049                if (could_be_empty_branch(scode, ketcode, utf8))
4050                  {
4051                  *bracode += OP_SBRA - OP_BRA;
4052                  break;
4053                  }
4054                scode += GET(scode, 1);
4055                }
4056              while (*scode == OP_ALT);
4057              }
4058            }
4059        }        }
4060    
4061      /* Else there's some kind of shambles */      /* Else there's some kind of shambles */
# Line 2837  for (;; ptr++) Line 4066  for (;; ptr++)
4066        goto FAILED;        goto FAILED;
4067        }        }
4068    
4069      /* If the character following a repeat is '+', we wrap the entire repeated      /* If the character following a repeat is '+', or if certain optimization
4070      item inside OP_ONCE brackets. This is just syntactic sugar, taken from      tests above succeeded, possessive_quantifier is TRUE. For some of the
4071      Sun's Java package. The repeated item starts at tempcode, not at previous,      simpler opcodes, there is a special alternative opcode for this. For
4072      which might be the first part of a string whose (former) last char we      anything else, we wrap the entire repeated item inside OP_ONCE brackets.
4073      repeated. However, we don't support '+' after a greediness '?'. */      The '+' notation is just syntactic sugar, taken from Sun's Java package,
4074        but the special opcodes can optimize it a bit. The repeated item starts at
4075        tempcode, not at previous, which might be the first part of a string whose
4076        (former) last char we repeated.
4077    
4078        Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
4079        an 'upto' may follow. We skip over an 'exact' item, and then test the
4080        length of what remains before proceeding. */
4081    
4082      if (possessive_quantifier)      if (possessive_quantifier)
4083        {        {
4084        int len = code - tempcode;        int len;
4085        memmove(tempcode + 1+LINK_SIZE, tempcode, len);        if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
4086        code += 1 + LINK_SIZE;            *tempcode == OP_NOTEXACT)
4087        len += 1 + LINK_SIZE;          tempcode += _pcre_OP_lengths[*tempcode] +
4088        tempcode[0] = OP_ONCE;            ((*tempcode == OP_TYPEEXACT &&
4089        *code++ = OP_KET;               (tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP))? 2:0);
4090        PUTINC(code, 0, len);        len = code - tempcode;
4091        PUT(tempcode, 1, len);        if (len > 0) switch (*tempcode)
4092            {
4093            case OP_STAR:  *tempcode = OP_POSSTAR; break;
4094            case OP_PLUS:  *tempcode = OP_POSPLUS; break;
4095            case OP_QUERY: *tempcode = OP_POSQUERY; break;
4096            case OP_UPTO:  *tempcode = OP_POSUPTO; break;
4097    
4098            case OP_TYPESTAR:  *tempcode = OP_TYPEPOSSTAR; break;
4099            case OP_TYPEPLUS:  *tempcode = OP_TYPEPOSPLUS; break;
4100            case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
4101            case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;
4102    
4103            case OP_NOTSTAR:  *tempcode = OP_NOTPOSSTAR; break;
4104            case OP_NOTPLUS:  *tempcode = OP_NOTPOSPLUS; break;
4105            case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
4106            case OP_NOTUPTO:  *tempcode = OP_NOTPOSUPTO; break;
4107    
4108            default:
4109            memmove(tempcode + 1+LINK_SIZE, tempcode, len);
4110            code += 1 + LINK_SIZE;
4111            len += 1 + LINK_SIZE;
4112            tempcode[0] = OP_ONCE;
4113            *code++ = OP_KET;
4114            PUTINC(code, 0, len);
4115            PUT(tempcode, 1, len);
4116            break;
4117            }
4118        }        }
4119    
4120      /* In all cases we no longer have a previous item. We also set the      /* In all cases we no longer have a previous item. We also set the
# Line 2865  for (;; ptr++) Line 4127  for (;; ptr++)
4127      break;      break;
4128    
4129    
4130      /* Start of nested bracket sub-expression, or comment or lookahead or      /* ===================================================================*/
4131      lookbehind or option setting or condition. First deal with special things      /* Start of nested parenthesized sub-expression, or comment or lookahead or
4132      that can come after a bracket; all are introduced by ?, and the appearance      lookbehind or option setting or condition or all the other extended
4133      of any of them means that this is not a referencing group. They were      parenthesis forms.  */
     checked for validity in the first pass over the string, so we don't have to  
     check for syntax errors here.  */  
4134    
4135      case '(':      case '(':
4136      newoptions = options;      newoptions = options;
4137      skipbytes = 0;      skipbytes = 0;
4138        bravalue = OP_CBRA;
4139        save_hwm = cd->hwm;
4140        reset_bracount = FALSE;
4141    
4142        /* First deal with various "verbs" that can be introduced by '*'. */
4143    
4144        if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
4145          {
4146          int i, namelen;
4147          const char *vn = verbnames;
4148          const uschar *name = ++ptr;
4149          previous = NULL;
4150          while ((cd->ctypes[*++ptr] & ctype_letter) != 0);
4151          if (*ptr == ':')
4152            {
4153            *errorcodeptr = ERR59;   /* Not supported */
4154            goto FAILED;
4155            }
4156          if (*ptr != ')')
4157            {
4158            *errorcodeptr = ERR60;
4159            goto FAILED;
4160            }
4161          namelen = ptr - name;
4162          for (i = 0; i < verbcount; i++)
4163            {
4164            if (namelen == verbs[i].len &&
4165                strncmp((char *)name, vn, namelen) == 0)
4166              {
4167              *code = verbs[i].op;
4168              if (*code++ == OP_ACCEPT) cd->had_accept = TRUE;
4169              break;
4170              }
4171            vn += verbs[i].len + 1;
4172            }
4173          if (i < verbcount) continue;
4174          *errorcodeptr = ERR60;
4175          goto FAILED;
4176          }
4177    
4178        /* Deal with the extended parentheses; all are introduced by '?', and the
4179        appearance of any of them means that this is not a capturing group. */
4180    
4181      if (*(++ptr) == '?')      else if (*ptr == '?')
4182        {        {
4183        int set, unset;        int i, set, unset, namelen;
4184        int *optset;        int *optset;
4185          const uschar *name;
4186          uschar *slot;
4187    
4188        switch (*(++ptr))        switch (*(++ptr))
4189          {          {
4190          case '#':                 /* Comment; skip to ket */          case '#':                 /* Comment; skip to ket */
4191          ptr++;          ptr++;
4192          while (*ptr != ')') ptr++;          while (*ptr != 0 && *ptr != ')') ptr++;
4193            if (*ptr == 0)
4194              {
4195              *errorcodeptr = ERR18;
4196              goto FAILED;
4197              }
4198          continue;          continue;
4199    
4200          case ':':                 /* Non-extracting bracket */  
4201            /* ------------------------------------------------------------ */
4202            case '|':                 /* Reset capture count for each branch */
4203            reset_bracount = TRUE;
4204            /* Fall through */
4205    
4206            /* ------------------------------------------------------------ */
4207            case ':':                 /* Non-capturing bracket */
4208          bravalue = OP_BRA;          bravalue = OP_BRA;
4209          ptr++;          ptr++;
4210          break;          break;
4211    
4212    
4213            /* ------------------------------------------------------------ */
4214          case '(':          case '(':
4215          bravalue = OP_COND;       /* Conditional group */          bravalue = OP_COND;       /* Conditional group */
4216    
4217          /* A condition can be a number, referring to a numbered group, a name,          /* A condition can be an assertion, a number (referring to a numbered
4218          referring to a named group, 'R', referring to recursion, or an          group), a name (referring to a named group), or 'R', referring to
4219          assertion. There are two unfortunate ambiguities, caused by history.          recursion. R<digits> and R&name are also permitted for recursion tests.
4220          (a) 'R' can be the recursive thing or the name 'R', and (b) a number  
4221          could be a name that consists of digits. In both cases, we look for a          There are several syntaxes for testing a named group: (?(name)) is used
4222          name first; if not found, we try the other cases. If the first          by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
4223          character after (?( is a word character, we know the rest up to ) will  
4224          also be word characters because the syntax was checked in the first          There are two unfortunate ambiguities, caused by history. (a) 'R' can
4225          pass. */          be the recursive thing or the name 'R' (and similarly for 'R' followed
4226            by digits), and (b) a number could be a name that consists of digits.
4227          if ((cd->ctypes[ptr[1]] & ctype_word) != 0)          In both cases, we look for a name first; if not found, we try the other
4228            {          cases. */
4229            int i, namelen;  
4230            int condref = 0;          /* For conditions that are assertions, check the syntax, and then exit
4231            const uschar *name;          the switch. This will take control down to where bracketed groups,
4232            uschar *slot = cd->name_table;          including assertions, are processed. */
4233    
4234            /* This is needed for all successful cases. */          if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
4235              break;
4236    
4237            skipbytes = 3;          /* Most other conditions use OP_CREF (a couple change to OP_RREF
4238            below), and all need to skip 3 bytes at the start of the group. */
4239    
4240            /* Read the name, but also get it as a number if it's all digits */          code[1+LINK_SIZE] = OP_CREF;
4241            skipbytes = 3;
4242            refsign = -1;
4243    
4244            name = ++ptr;          /* Check for a test for recursion in a named group. */
4245            while (*ptr != ')')  
4246              {          if (ptr[1] == 'R' && ptr[2] == '&')
4247              if (condref >= 0)            {
4248                condref = ((digitab[*ptr] & ctype_digit) != 0)?            terminator = -1;
4249                  condref * 10 + *ptr - '0' : -1;            ptr += 2;
4250              ptr++;            code[1+LINK_SIZE] = OP_RREF;    /* Change the type of test */
4251              }            }
4252            namelen = ptr - name;  
4253            /* Check for a test for a named group's having been set, using the Perl
4254            syntax (?(<name>) or (?('name') */
4255    
4256            else if (ptr[1] == '<')
4257              {
4258              terminator = '>';
4259              ptr++;
4260              }
4261            else if (ptr[1] == '\'')
4262              {
4263              terminator = '\'';
4264            ptr++;            ptr++;
4265              }
4266            else
4267              {
4268              terminator = 0;
4269              if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
4270              }
4271    
4272            for (i = 0; i < cd->names_found; i++)          /* We now expect to read a name; anything else is an error */
             {  
             if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;  
             slot += cd->name_entry_size;  
             }  
4273    
4274            /* Found a previous named subpattern */          if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
4275              {
4276              ptr += 1;  /* To get the right offset */
4277              *errorcodeptr = ERR28;
4278              goto FAILED;
4279              }
4280    
4281            if (i < cd->names_found)          /* Read the name, but also get it as a number if it's all digits */
             {  
             condref = GET2(slot, 0);  
             code[1+LINK_SIZE] = OP_CREF;  
             PUT2(code, 2+LINK_SIZE, condref);  
             }  
4282    
4283            /* Search the pattern for a forward reference */          recno = 0;
4284            name = ++ptr;
4285            while ((cd->ctypes[*ptr] & ctype_word) != 0)
4286              {
4287              if (recno >= 0)
4288                recno = ((digitab[*ptr] & ctype_digit) != 0)?
4289                  recno * 10 + *ptr - '0' : -1;
4290              ptr++;
4291              }
4292            namelen = ptr - name;
4293    
4294            else if ((i = find_named_parens(ptr, *brackets, name, namelen)) > 0)          if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
4295              {            {
4296              code[1+LINK_SIZE] = OP_CREF;            ptr--;      /* Error offset */
4297              PUT2(code, 2+LINK_SIZE, i);            *errorcodeptr = ERR26;
4298              }            goto FAILED;
4299              }
4300    
4301            /* Check for 'R' for recursion */          /* Do no further checking in the pre-compile phase. */
4302    
4303            else if (namelen == 1 && *name == 'R')          if (lengthptr != NULL) break;
             {  
             code[1+LINK_SIZE] = OP_CREF;  
             PUT2(code, 2+LINK_SIZE, CREF_RECURSE);  
             }  
4304    
4305            /* Check for a subpattern number */          /* In the real compile we do the work of looking for the actual
4306            reference. If the string started with "+" or "-" we require the rest to
4307            be digits, in which case recno will be set. */
4308    
4309            else if (condref > 0)          if (refsign > 0)
4310              {
4311              if (recno <= 0)
4312              {              {
4313              code[1+LINK_SIZE] = OP_CREF;              *errorcodeptr = ERR58;
4314              PUT2(code, 2+LINK_SIZE, condref);              goto FAILED;
4315              }              }
4316              recno = (refsign == '-')?
4317            /* Either an unidentified subpattern, or a reference to (?(0) */              cd->bracount - recno + 1 : recno +cd->bracount;
4318              if (recno <= 0 || recno > cd->final_bracount)
           else  
4319              {              {
4320              *errorcodeptr = (condref == 0)? ERR35: ERR15;              *errorcodeptr = ERR15;
4321              goto FAILED;              goto FAILED;
4322              }              }
4323              PUT2(code, 2+LINK_SIZE, recno);
4324              break;
4325            }            }
4326    
4327          /* For conditions that are assertions, we just fall through, having          /* Otherwise (did not start with "+" or "-"), start by looking for the
4328          set bravalue above. */          name. */
4329    
4330          break;          slot = cd->name_table;
4331            for (i = 0; i < cd->names_found; i++)
4332              {
4333              if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4334              slot += cd->name_entry_size;
4335              }
4336    
4337          case '=':                 /* Positive lookahead */          /* Found a previous named subpattern */
         bravalue = OP_ASSERT;  
         ptr++;  
         break;  
4338    
4339          case '!':                 /* Negative lookahead */          if (i < cd->names_found)
4340          bravalue = OP_ASSERT_NOT;            {
4341          ptr++;            recno = GET2(slot, 0);
4342          break;            PUT2(code, 2+LINK_SIZE, recno);
4343              }
4344    
4345            /* Search the pattern for a forward reference */
4346    
4347          case '<':                 /* Lookbehinds */          else if ((i = find_parens(ptr, cd->bracount, name, namelen,
4348          switch (*(++ptr))                          (options & PCRE_EXTENDED) != 0)) > 0)
4349            {            {
4350            case '=':               /* Positive lookbehind */            PUT2(code, 2+LINK_SIZE, i);
4351            bravalue = OP_ASSERTBACK;            }
           ptr++;  
           break;  
4352    
4353            case '!':               /* Negative lookbehind */          /* If terminator == 0 it means that the name followed directly after
4354            bravalue = OP_ASSERTBACK_NOT;          the opening parenthesis [e.g. (?(abc)...] and in this case there are
4355            ptr++;          some further alternatives to try. For the cases where terminator != 0
4356            break;          [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
4357            now checked all the possibilities, so give an error. */
4358    
4359            else if (terminator != 0)
4360              {
4361              *errorcodeptr = ERR15;
4362              goto FAILED;
4363            }            }
         break;  
4364    
4365          case '>':                 /* One-time brackets */          /* Check for (?(R) for recursion. Allow digits after R to specify a
4366          bravalue = OP_ONCE;          specific group number. */
         ptr++;  
         break;  
4367    
4368          case 'C':                 /* Callout - may be followed by digits; */          else if (*name == 'R')
4369          previous_callout = code;  /* Save for later completion */            {
4370          after_manual_callout = 1; /* Skip one item before completing */            recno = 0;
4371          *code++ = OP_CALLOUT;     /* Already checked that the terminating */            for (i = 1; i < namelen; i++)
4372            {                       /* closing parenthesis is present. */              {
4373                if ((digitab[name[i]] & ctype_digit) == 0)
4374                  {
4375                  *errorcodeptr = ERR15;
4376                  goto FAILED;
4377                  }
4378                recno = recno * 10 + name[i] - '0';
4379                }
4380              if (recno == 0) recno = RREF_ANY;
4381              code[1+LINK_SIZE] = OP_RREF;      /* Change test type */
4382              PUT2(code, 2+LINK_SIZE, recno);
4383              }
4384    
4385            /* Similarly, check for the (?(DEFINE) "condition", which is always
4386            false. */
4387    
4388            else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
4389              {
4390              code[1+LINK_SIZE] = OP_DEF;
4391              skipbytes = 1;
4392              }
4393    
4394            /* Check for the "name" actually being a subpattern number. We are
4395            in the second pass here, so final_bracount is set. */
4396    
4397            else if (recno > 0 && recno <= cd->final_bracount)
4398              {
4399              PUT2(code, 2+LINK_SIZE, recno);
4400              }
4401    
4402            /* Either an unidentified subpattern, or a reference to (?(0) */
4403    
4404            else
4405              {
4406              *errorcodeptr = (recno == 0)? ERR35: ERR15;
4407              goto FAILED;
4408              }
4409            break;
4410    
4411    
4412            /* ------------------------------------------------------------ */
4413            case '=':                 /* Positive lookahead */
4414            bravalue = OP_ASSERT;
4415            ptr++;
4416            break;
4417    
4418    
4419            /* ------------------------------------------------------------ */
4420            case '!':                 /* Negative lookahead */
4421            ptr++;
4422            if (*ptr == ')')          /* Optimize (?!) */
4423              {
4424              *code++ = OP_FAIL;
4425              previous = NULL;
4426              continue;
4427              }
4428            bravalue = OP_ASSERT_NOT;
4429            break;
4430    
4431    
4432            /* ------------------------------------------------------------ */
4433            case '<':                 /* Lookbehind or named define */
4434            switch (ptr[1])
4435              {
4436              case '=':               /* Positive lookbehind */
4437              bravalue = OP_ASSERTBACK;
4438              ptr += 2;
4439              break;
4440    
4441              case '!':               /* Negative lookbehind */
4442              bravalue = OP_ASSERTBACK_NOT;
4443              ptr += 2;
4444              break;
4445    
4446              default:                /* Could be name define, else bad */
4447              if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
4448              ptr++;                  /* Correct offset for error */
4449              *errorcodeptr = ERR24;
4450              goto FAILED;
4451              }
4452            break;
4453    
4454    
4455            /* ------------------------------------------------------------ */
4456            case '>':                 /* One-time brackets */
4457            bravalue = OP_ONCE;
4458            ptr++;
4459            break;
4460    
4461    
4462            /* ------------------------------------------------------------ */
4463            case 'C':                 /* Callout - may be followed by digits; */
4464            previous_callout = code;  /* Save for later completion */
4465            after_manual_callout = 1; /* Skip one item before completing */
4466            *code++ = OP_CALLOUT;
4467              {
4468            int n = 0;            int n = 0;
4469            while ((digitab[*(++ptr)] & ctype_digit) != 0)            while ((digitab[*(++ptr)] & ctype_digit) != 0)
4470              n = n * 10 + *ptr - '0';              n = n * 10 + *ptr - '0';
4471              if (*ptr != ')')
4472                {
4473                *errorcodeptr = ERR39;
4474                goto FAILED;
4475                }
4476            if (n > 255)            if (n > 255)
4477              {              {
4478              *errorcodeptr = ERR38;              *errorcodeptr = ERR38;
# Line 3034  for (;; ptr++) Line 4486  for (;; ptr++)
4486          previous = NULL;          previous = NULL;
4487          continue;          continue;
4488    
4489          case 'P':                 /* Named subpattern handling */  
4490          if (*(++ptr) == '<')      /* Definition */          /* ------------------------------------------------------------ */
4491            case 'P':                 /* Python-style named subpattern handling */
4492            if (*(++ptr) == '=' || *ptr == '>')  /* Reference or recursion */
4493              {
4494              is_recurse = *ptr == '>';
4495              terminator = ')';
4496              goto NAMED_REF_OR_RECURSE;
4497              }
4498            else if (*ptr != '<')    /* Test for Python-style definition */
4499              {
4500              *errorcodeptr = ERR41;
4501              goto FAILED;
4502              }
4503            /* Fall through to handle (?P< as (?< is handled */
4504    
4505    
4506            /* ------------------------------------------------------------ */
4507            DEFINE_NAME:    /* Come here from (?< handling */
4508            case '\'':
4509            {            {
4510            int i, namelen;            terminator = (*ptr == '<')? '>' : '\'';
4511            uschar *slot = cd->name_table;            name = ++ptr;
           const uschar *name;     /* Don't amalgamate; some compilers */  
           name = ++ptr;           /* grumble at autoincrement in declaration */  
4512    
4513            while (*ptr++ != '>');            while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4514            namelen = ptr - name - 1;            namelen = ptr - name;
4515    
4516            for (i = 0; i < cd->names_found; i++)            /* In the pre-compile phase, just do a syntax check. */
4517    
4518              if (lengthptr != NULL)
4519              {              {
4520              int crc = memcmp(name, slot+2, namelen);              if (*ptr != terminator)
4521              if (crc == 0)                {
4522                  *errorcodeptr = ERR42;
4523                  goto FAILED;
4524                  }
4525                if (cd->names_found >= MAX_NAME_COUNT)
4526                {                {
4527                if (slot[2+namelen] == 0)                *errorcodeptr = ERR49;
4528                  goto FAILED;
4529                  }
4530                if (namelen + 3 > cd->name_entry_size)
4531                  {
4532                  cd->name_entry_size = namelen + 3;
4533                  if (namelen > MAX_NAME_SIZE)
4534                  {                  {
4535                  if ((options & PCRE_DUPNAMES) == 0)                  *errorcodeptr = ERR48;
4536                    {                  goto FAILED;
                   *errorcodeptr = ERR43;  
                   goto FAILED;  
                   }  
4537                  }                  }
               else crc = -1;      /* Current name is substring */  
4538                }                }
4539              if (crc < 0)              }
4540    
4541              /* In the real compile, create the entry in the table */
4542    
4543              else
4544                {
4545                slot = cd->name_table;
4546                for (i = 0; i < cd->names_found; i++)
4547                {                {
4548                memmove(slot + cd->name_entry_size, slot,                int crc = memcmp(name, slot+2, namelen);
4549                  (cd->names_found - i) * cd->name_entry_size);                if (crc == 0)
4550                break;                  {
4551                    if (slot[2+namelen] == 0)
4552                      {
4553                      if ((options & PCRE_DUPNAMES) == 0)
4554                        {
4555                        *errorcodeptr = ERR43;
4556                        goto FAILED;
4557                        }
4558                      }
4559                    else crc = -1;      /* Current name is substring */
4560                    }
4561                  if (crc < 0)
4562                    {
4563                    memmove(slot + cd->name_entry_size, slot,
4564                      (cd->names_found - i) * cd->name_entry_size);
4565                    break;
4566                    }
4567                  slot += cd->name_entry_size;
4568                }                }
             slot += cd->name_entry_size;  
             }  
4569    
4570            PUT2(slot, 0, *brackets + 1);              PUT2(slot, 0, cd->bracount + 1);
4571            memcpy(slot + 2, name, namelen);              memcpy(slot + 2, name, namelen);
4572            slot[2+namelen] = 0;              slot[2+namelen] = 0;
4573            cd->names_found++;              }
           goto NUMBERED_GROUP;  
4574            }            }
4575    
4576          if (*ptr == '=' || *ptr == '>')  /* Reference or recursion */          /* In both cases, count the number of names we've encountered. */
4577    
4578            ptr++;                    /* Move past > or ' */
4579            cd->names_found++;
4580            goto NUMBERED_GROUP;
4581    
4582    
4583            /* ------------------------------------------------------------ */
4584            case '&':                 /* Perl recursion/subroutine syntax */
4585            terminator = ')';
4586            is_recurse = TRUE;
4587            /* Fall through */
4588    
4589            /* We come here from the Python syntax above that handles both
4590            references (?P=name) and recursion (?P>name), as well as falling
4591            through from the Perl recursion syntax (?&name). We also come here from
4592            the Perl \k<name> or \k'name' back reference syntax and the \k{name}
4593            .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */
4594    
4595            NAMED_REF_OR_RECURSE:
4596            name = ++ptr;
4597            while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4598            namelen = ptr - name;
4599    
4600            /* In the pre-compile phase, do a syntax check and set a dummy
4601            reference number. */
4602    
4603            if (lengthptr != NULL)
4604            {            {
4605            int i, namelen;            if (namelen == 0)
4606            int type = *ptr++;              {
4607            const uschar *name = ptr;              *errorcodeptr = ERR62;
4608            uschar *slot = cd->name_table;              goto FAILED;
4609                }
4610              if (*ptr != terminator)
4611                {
4612                *errorcodeptr = ERR42;
4613                goto FAILED;
4614                }
4615              if (namelen > MAX_NAME_SIZE)
4616                {
4617                *errorcodeptr = ERR48;
4618                goto FAILED;
4619                }
4620              recno = 0;
4621              }
4622    
4623            while (*ptr != ')') ptr++;          /* In the real compile, seek the name in the table. We check the name
4624            namelen = ptr - name;          first, and then check that we have reached the end of the name in the
4625            table. That way, if the name is longer than any in the table,
4626            the comparison will fail without reading beyond the table entry. */
4627    
4628            else
4629              {
4630              slot = cd->name_table;
4631            for (i = 0; i < cd->names_found; i++)            for (i = 0; i < cd->names_found; i++)
4632              {              {
4633              if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;              if (strncmp((char *)name, (char *)slot+2, namelen) == 0 &&
4634                    slot[2+namelen] == 0)
4635                  break;
4636              slot += cd->name_entry_size;              slot += cd->name_entry_size;
4637              }              }
4638    
# Line 3097  for (;; ptr++) Line 4641  for (;; ptr++)
4641              recno = GET2(slot, 0);              recno = GET2(slot, 0);
4642              }              }
4643            else if ((recno =                /* Forward back reference */            else if ((recno =                /* Forward back reference */
4644                      find_named_parens(ptr, *brackets, name, namelen)) <= 0)                      find_parens(ptr, cd->bracount, name, namelen,
4645                          (options & PCRE_EXTENDED) != 0)) <= 0)
4646              {              {
4647              *errorcodeptr = ERR15;              *errorcodeptr = ERR15;
4648              goto FAILED;              goto FAILED;
4649              }              }
4650              }
4651    
4652            if (type == '>') goto HANDLE_RECURSION;  /* A few lines below */          /* In both phases, we can now go to the code that handles numerical
4653            recursion or backreferences. */
           /* Back reference */  
4654    
4655            previous = code;          if (is_recurse) goto HANDLE_RECURSION;
4656            *code++ = OP_REF;            else goto HANDLE_REFERENCE;
           PUT2INC(code, 0, recno);  
           cd->backref_map |= (recno < 32)? (1 << recno) : 1;  
           if (recno > cd->top_backref) cd->top_backref = recno;  
           continue;  
           }  
4657    
         /* Should never happen */  
         break;  
4658    
4659          case 'R':                 /* Pattern recursion */          /* ------------------------------------------------------------ */
4660            case 'R':                 /* Recursion */
4661          ptr++;                    /* Same as (?0)      */          ptr++;                    /* Same as (?0)      */
4662          /* Fall through */          /* Fall through */
4663    
         /* Recursion or "subroutine" call */  
4664    
4665          case '0': case '1': case '2': case '3': case '4':          /* ------------------------------------------------------------ */
4666          case '5': case '6': case '7': case '8': case '9':          case '-': case '+':
4667            case '0': case '1': case '2': case '3': case '4':   /* Recursion or */
4668            case '5': case '6': case '7': case '8': case '9':   /* subroutine */
4669            {            {
4670            const uschar *called;            const uschar *called;
4671              terminator = ')';
4672    
4673              /* Come here from the \g<...> and \g'...' code (Oniguruma
4674              compatibility). However, the syntax has been checked to ensure that
4675              the ... are a (signed) number, so that neither ERR63 nor ERR29 will
4676              be called on this path, nor will the jump to OTHER_CHAR_AFTER_QUERY
4677              ever be taken. */
4678    
4679              HANDLE_NUMERICAL_RECURSION:
4680    
4681              if ((refsign = *ptr) == '+')
4682                {
4683                ptr++;
4684                if ((digitab[*ptr] & ctype_digit) == 0)
4685                  {
4686                  *errorcodeptr = ERR63;
4687                  goto FAILED;
4688                  }
4689                }
4690              else if (refsign == '-')
4691                {
4692                if ((digitab[ptr[1]] & ctype_digit) == 0)
4693                  goto OTHER_CHAR_AFTER_QUERY;
4694                ptr++;
4695                }
4696    
4697            recno = 0;            recno = 0;
4698            while((digitab[*ptr] & ctype_digit) != 0)            while((digitab[*ptr] & ctype_digit) != 0)
4699              recno = recno * 10 + *ptr++ - '0';              recno = recno * 10 + *ptr++ - '0';
4700    
4701              if (*ptr != terminator)
4702                {
4703                *errorcodeptr = ERR29;
4704                goto FAILED;
4705                }
4706    
4707              if (refsign == '-')
4708                {
4709                if (recno == 0)
4710                  {
4711                  *errorcodeptr = ERR58;
4712                  goto FAILED;
4713                  }
4714                recno = cd->bracount - recno + 1;
4715                if (recno <= 0)
4716                  {
4717                  *errorcodeptr = ERR15;
4718                  goto FAILED;
4719                  }
4720                }
4721              else if (refsign == '+')
4722                {
4723                if (recno == 0)
4724                  {
4725                  *errorcodeptr = ERR58;
4726                  goto FAILED;
4727                  }
4728                recno += cd->bracount;
4729                }
4730    
4731            /* Come here from code above that handles a named recursion */            /* Come here from code above that handles a named recursion */
4732    
4733            HANDLE_RECURSION:            HANDLE_RECURSION:
4734    
4735            previous = code;            previous = code;
4736              called = cd->start_code;
4737    
4738            /* Find the bracket that is being referenced. Temporarily end the            /* When we are actually compiling, find the bracket that is being
4739            regex in case it doesn't exist. */            referenced. Temporarily end the regex in case it doesn't exist before
4740              this point. If we end up with a forward reference, first check that
4741              the bracket does occur later so we can give the error (and position)
4742              now. Then remember this forward reference in the workspace so it can
4743              be filled in at the end. */
4744    
4745            *code = OP_END;            if (lengthptr == NULL)
           called = (recno == 0)? cd->start_code :  
             find_bracket(cd->start_code, utf8, recno);  
           if (called == NULL)  
4746