/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory | Revision Log | View Patch

revision 87 by nigel, Sat Feb 24 21:41:21 2007 UTC revision 336 by ph10, Sat Apr 12 15:59:03 2008 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2006 University of Cambridge             Copyright (c) 1997-2008 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 42  POSSIBILITY OF SUCH DAMAGE. Line 42  POSSIBILITY OF SUCH DAMAGE.
42  supporting internal functions that are not used by other modules. */  supporting internal functions that are not used by other modules. */
43    
44    
45    #ifdef HAVE_CONFIG_H
46    #include "config.h"
47    #endif
48    
49    #define NLBLOCK cd             /* Block containing newline information */
50    #define PSSTART start_pattern  /* Field containing processed string start */
51    #define PSEND   end_pattern    /* Field containing processed string end */
52    
53  #include "pcre_internal.h"  #include "pcre_internal.h"
54    
55    
# Line 53  used by pcretest. DEBUG is not defined w Line 61  used by pcretest. DEBUG is not defined w
61  #endif  #endif
62    
63    
/* Macro for setting individual bits in class bitmaps: sets bit number b in the
byte vector a. Each parameter and the whole expansion are parenthesized, so
that an expression argument such as SETBIT(map, c + 8) indexes and shifts by
the complete expression, and so that the macro can be used safely inside a
larger expression. Note that b is still evaluated twice, so it must not have
side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
67    
68    /* Maximum length value to check against when making sure that the integer that
69    holds the compiled pattern length does not overflow. We make it a bit less than
70    INT_MAX to allow for adding in group terminating bytes, so that we don't have
71    to check them every time. */
72    
73    #define OFLOW_MAX (INT_MAX - 20)
74    
75    
76  /*************************************************  /*************************************************
77  *      Code parameters and static tables         *  *      Code parameters and static tables         *
78  *************************************************/  *************************************************/
79    
80  /* Maximum number of items on the nested bracket stacks at compile time. This  /* This value specifies the size of stack workspace that is used during the
81  applies to the nesting of all kinds of parentheses. It does not limit  first pre-compile phase that determines how much memory is required. The regex
82  un-nested, non-capturing parentheses. This number can be made bigger if  is partly compiled into this space, but the compiled parts are discarded as
83  necessary - it is used to dimension one int and one unsigned char vector at  soon as they can be, so that hopefully there will never be an overrun. The code
84  compile time. */  does, however, check for an overrun. The largest amount I've seen used is 218,
85    so this number is very generous.
86    
87    The same workspace is used during the second, actual compile phase for
88    remembering forward references to groups so that they can be filled in at the
89    end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
90    is 4 there is plenty of room. */
91    
92  #define BRASTACK_SIZE 200  #define COMPILE_WORK_SIZE (4096)
93    
94    
95  /* Table for handling escaped characters in the range '0'-'z'. Positive returns  /* Table for handling escaped characters in the range '0'-'z'. Positive returns
# Line 72  are simple data values; negative values Line 97  are simple data values; negative values
97  on. Zero means further processing is needed (for things like \x), or the escape  on. Zero means further processing is needed (for things like \x), or the escape
98  is invalid. */  is invalid. */
99    
100  #if !EBCDIC   /* This is the "normal" table for ASCII systems */  #ifndef EBCDIC  /* This is the "normal" table for ASCII systems */
101  static const short int escapes[] = {  static const short int escapes[] = {
102       0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */       0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */
103       0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */       0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */
104     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */
105       0,      0,      0,      0,      0,      0,      0,      0,   /* H - O */  -ESC_H,      0,      0, -ESC_K,      0,      0,      0,      0,   /* H - O */
106  -ESC_P, -ESC_Q,      0, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */  -ESC_P, -ESC_Q, -ESC_R, -ESC_S,      0,      0, -ESC_V, -ESC_W,   /* P - W */
107  -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */  -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */
108     '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */     '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */
109       0,      0,      0,      0,      0,      0,  ESC_n,      0,   /* h - o */  -ESC_h,      0,      0, -ESC_k,      0,      0,  ESC_n,      0,   /* h - o */
110  -ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0,      0, -ESC_w,   /* p - w */  -ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0, -ESC_v, -ESC_w,   /* p - w */
111       0,      0, -ESC_z                                            /* x - z */       0,      0, -ESC_z                                            /* x - z */
112  };  };
113    
114  #else         /* This is the "abnormal" table for EBCDIC systems */  #else           /* This is the "abnormal" table for EBCDIC systems */
115  static const short int escapes[] = {  static const short int escapes[] = {
116  /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',  /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
117  /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,  /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
# Line 96  static const short int escapes[] = { Line 121  static const short int escapes[] = {
121  /*  70 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  70 */     0,     0,      0,       0,      0,     0,      0,      0,
122  /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',  /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
123  /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,  /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
124  /*  88 */     0,     0,      0,     '{',      0,     0,      0,      0,  /*  88 */-ESC_h,     0,      0,     '{',      0,     0,      0,      0,
125  /*  90 */     0,     0,      0,     'l',      0, ESC_n,      0, -ESC_p,  /*  90 */     0,     0, -ESC_k,     'l',      0, ESC_n,      0, -ESC_p,
126  /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,  /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
127  /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,     0, -ESC_w,      0,  /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,-ESC_v, -ESC_w,      0,
128  /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,  /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
129  /*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,
130  /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',  /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
131  /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,  /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
132  /*  C8 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,
133  /*  D0 */   '}',     0,      0,       0,      0,     0,      0, -ESC_P,  /*  D0 */   '}',     0, -ESC_K,       0,      0,     0,      0, -ESC_P,
134  /*  D8 */-ESC_Q,     0,      0,       0,      0,     0,      0,      0,  /*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
135  /*  E0 */  '\\',     0, -ESC_S,       0,      0,     0, -ESC_W, -ESC_X,  /*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,
136  /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,  /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
137  /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
138  /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0  /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0
# Line 115  static const short int escapes[] = { Line 140  static const short int escapes[] = {
140  #endif  #endif
141    
142    
143  /* Tables of names of POSIX character classes and their lengths. The list is  /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
144  terminated by a zero length entry. The first three must be alpha, lower, upper,  searched linearly. Put all the names into a single string, in order to reduce
145  as this is assumed for handling case independence. */  the number of relocations when a shared library is dynamically linked. */
146    
147  static const char *const posix_names[] = {  typedef struct verbitem {
148    "alpha", "lower", "upper",    int   len;
149    "alnum", "ascii", "blank", "cntrl", "digit", "graph",    int   op;
150    "print", "punct", "space", "word",  "xdigit" };  } verbitem;
151    
152    static const char verbnames[] =
153      "ACCEPT\0"
154      "COMMIT\0"
155      "F\0"
156      "FAIL\0"
157      "PRUNE\0"
158      "SKIP\0"
159      "THEN";
160    
161    static const verbitem verbs[] = {
162      { 6, OP_ACCEPT },
163      { 6, OP_COMMIT },
164      { 1, OP_FAIL },
165      { 4, OP_FAIL },
166      { 5, OP_PRUNE },
167      { 4, OP_SKIP  },
168      { 4, OP_THEN  }
169    };
170    
171    static const int verbcount = sizeof(verbs)/sizeof(verbitem);
172    
173    
174    /* Tables of names of POSIX character classes and their lengths. The names are
175    now all in a single string, to reduce the number of relocations when a shared
176    library is dynamically loaded. The list of lengths is terminated by a zero
177    length entry. The first three must be alpha, lower, upper, as this is assumed
178    for handling case independence. */
179    
180    static const char posix_names[] =
181      "alpha\0"  "lower\0"  "upper\0"  "alnum\0"  "ascii\0"  "blank\0"
182      "cntrl\0"  "digit\0"  "graph\0"  "print\0"  "punct\0"  "space\0"
183      "word\0"   "xdigit";
184    
185  static const uschar posix_name_lengths[] = {  static const uschar posix_name_lengths[] = {
186    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
# Line 155  static const int posix_class_maps[] = { Line 213  static const int posix_class_maps[] = {
213  };  };
214    
215    
216  /* The texts of compile-time error messages. These are "char *" because they  #define STRING(a)  # a
217  are passed to the outside world. */  #define XSTRING(s) STRING(s)
218    
219  static const char *error_texts[] = {  /* The texts of compile-time error messages. These are "char *" because they
220    "no error",  are passed to the outside world. Do not ever re-use any error number, because
221    "\\ at end of pattern",  they are documented. Always add a new error instead. Messages marked DEAD below
222    "\\c at end of pattern",  are no longer used. This used to be a table of strings, but in order to reduce
223    "unrecognized character follows \\",  the number of relocations needed when a shared library is loaded dynamically,
224    "numbers out of order in {} quantifier",  it is now one long string. We cannot use a table of offsets, because the
225    lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
226    simply count through to the one we want - this isn't a performance issue
227    because these strings are used only when there is a compilation error. */
228    
229    static const char error_texts[] =
230      "no error\0"
231      "\\ at end of pattern\0"
232      "\\c at end of pattern\0"
233      "unrecognized character follows \\\0"
234      "numbers out of order in {} quantifier\0"
235    /* 5 */    /* 5 */
236    "number too big in {} quantifier",    "number too big in {} quantifier\0"
237    "missing terminating ] for character class",    "missing terminating ] for character class\0"
238    "invalid escape sequence in character class",    "invalid escape sequence in character class\0"
239    "range out of order in character class",    "range out of order in character class\0"
240    "nothing to repeat",    "nothing to repeat\0"
241    /* 10 */    /* 10 */
242    "operand of unlimited repeat could match the empty string",    "operand of unlimited repeat could match the empty string\0"  /** DEAD **/
243    "internal error: unexpected repeat",    "internal error: unexpected repeat\0"
244    "unrecognized character after (?",    "unrecognized character after (? or (?-\0"
245    "POSIX named classes are supported only within a class",    "POSIX named classes are supported only within a class\0"
246    "missing )",    "missing )\0"
247    /* 15 */    /* 15 */
248    "reference to non-existent subpattern",    "reference to non-existent subpattern\0"
249    "erroffset passed as NULL",    "erroffset passed as NULL\0"
250    "unknown option bit(s) set",    "unknown option bit(s) set\0"
251    "missing ) after comment",    "missing ) after comment\0"
252    "parentheses nested too deeply",    "parentheses nested too deeply\0"  /** DEAD **/
253    /* 20 */    /* 20 */
254    "regular expression too large",    "regular expression is too large\0"
255    "failed to get memory",    "failed to get memory\0"
256    "unmatched parentheses",    "unmatched parentheses\0"
257    "internal error: code overflow",    "internal error: code overflow\0"
258    "unrecognized character after (?<",    "unrecognized character after (?<\0"
259    /* 25 */    /* 25 */
260    "lookbehind assertion is not fixed length",    "lookbehind assertion is not fixed length\0"
261    "malformed number after (?(",    "malformed number or name after (?(\0"
262    "conditional group contains more than two branches",    "conditional group contains more than two branches\0"
263    "assertion expected after (?(",    "assertion expected after (?(\0"
264    "(?R or (?digits must be followed by )",    "(?R or (?[+-]digits must be followed by )\0"
265    /* 30 */    /* 30 */
266    "unknown POSIX class name",    "unknown POSIX class name\0"
267    "POSIX collating elements are not supported",    "POSIX collating elements are not supported\0"
268    "this version of PCRE is not compiled with PCRE_UTF8 support",    "this version of PCRE is not compiled with PCRE_UTF8 support\0"
269    "spare error",    "spare error\0"  /** DEAD **/
270    "character value in \\x{...} sequence is too large",    "character value in \\x{...} sequence is too large\0"
271    /* 35 */    /* 35 */
272    "invalid condition (?(0)",    "invalid condition (?(0)\0"
273    "\\C not allowed in lookbehind assertion",    "\\C not allowed in lookbehind assertion\0"
274    "PCRE does not support \\L, \\l, \\N, \\U, or \\u",    "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"
275    "number after (?C is > 255",    "number after (?C is > 255\0"
276    "closing ) for (?C expected",    "closing ) for (?C expected\0"
277    /* 40 */    /* 40 */
278    "recursive call could loop indefinitely",    "recursive call could loop indefinitely\0"
279    "unrecognized character after (?P",    "unrecognized character after (?P\0"
280    "syntax error after (?P",    "syntax error in subpattern name (missing terminator)\0"
281    "two named groups have the same name",    "two named subpatterns have the same name\0"
282    "invalid UTF-8 string",    "invalid UTF-8 string\0"
283    /* 45 */    /* 45 */
284    "support for \\P, \\p, and \\X has not been compiled",    "support for \\P, \\p, and \\X has not been compiled\0"
285    "malformed \\P or \\p sequence",    "malformed \\P or \\p sequence\0"
286    "unknown property name after \\P or \\p"    "unknown property name after \\P or \\p\0"
287  };    "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
288      "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
289      /* 50 */
290      "repeated subpattern is too long\0"    /** DEAD **/
291      "octal value is greater than \\377 (not in UTF-8 mode)\0"
292      "internal error: overran compiling workspace\0"
293      "internal error: previously-checked referenced subpattern not found\0"
294      "DEFINE group contains more than one branch\0"
295      /* 55 */
296      "repeating a DEFINE group is not allowed\0"
297      "inconsistent NEWLINE options\0"
298      "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
299      "a numbered reference must not be zero\0"
300      "(*VERB) with an argument is not supported\0"
301      /* 60 */
302      "(*VERB) not recognized\0"
303      "number is too big\0"
304      "subpattern name expected\0"
305      "digit expected after (?+\0"
306      "] is an invalid data character in JavaScript compatibility mode";
307    
308    
309  /* Table to identify digits and hex digits. This is used when compiling  /* Table to identify digits and hex digits. This is used when compiling
# Line 235  For convenience, we use the same bit def Line 322  For convenience, we use the same bit def
322    
323  Then we can use ctype_digit and ctype_xdigit in the code. */  Then we can use ctype_digit and ctype_xdigit in the code. */
324    
325  #if !EBCDIC    /* This is the "normal" case, for ASCII systems */  #ifndef EBCDIC  /* This is the "normal" case, for ASCII systems */
326  static const unsigned char digitab[] =  static const unsigned char digitab[] =
327    {    {
328    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
# Line 271  static const unsigned char digitab[] = Line 358  static const unsigned char digitab[] =
358    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
359    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
360    
361  #else          /* This is the "abnormal" case, for EBCDIC systems */  #else           /* This is the "abnormal" case, for EBCDIC systems */
362  static const unsigned char digitab[] =  static const unsigned char digitab[] =
363    {    {
364    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
# Line 285  static const unsigned char digitab[] = Line 372  static const unsigned char digitab[] =
372    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
373    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
374    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
375    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88-     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- 95    */
376    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
377    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
378    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
# Line 319  static const unsigned char ebcdic_charta Line 406  static const unsigned char ebcdic_charta
406    0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */    0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */
407    0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */    0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
408    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
409    0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88-  */    0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- 95 */
410    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
411    0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */    0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
412    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
# Line 346  static const unsigned char ebcdic_charta Line 433  static const unsigned char ebcdic_charta
433  /* Definition to allow mutual recursion */  /* Definition to allow mutual recursion */
434    
435  static BOOL  static BOOL
436    compile_regex(int, int, int *, uschar **, const uschar **, int *, BOOL, int,    compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
437      int *, int *, branch_chain *, compile_data *);      int *, int *, branch_chain *, compile_data *, int *);
438    
439    
440    
441    /*************************************************
442    *            Find an error text                  *
443    *************************************************/
444    
445    /* The error texts are now all in one long string, to save on relocations. As
446    some of the text is of unknown length, we can't use a table of offsets.
447    Instead, just count through the strings. This is not a performance issue
448    because it happens only when there has been a compilation error.
449    
450    Argument:   the error number
451    Returns:    pointer to the error string
452    */
453    
454    static const char *
455    find_error_text(int n)
456    {
457    const char *s = error_texts;
458    for (; n > 0; n--) while (*s++ != 0);
459    return s;
460    }
461    
462    
463  /*************************************************  /*************************************************
# Line 357  static BOOL Line 466  static BOOL
466    
467  /* This function is called when a \ has been encountered. It either returns a  /* This function is called when a \ has been encountered. It either returns a
468  positive value for a simple escape such as \n, or a negative value which  positive value for a simple escape such as \n, or a negative value which
469  encodes one of the more complicated things such as \d. When UTF-8 is enabled,  encodes one of the more complicated things such as \d. A backreference to group
470  a positive value greater than 255 may be returned. On entry, ptr is pointing at  n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
471  the \. On exit, it is on the final character of the escape sequence.  UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
472    ptr is pointing at the \. On exit, it is on the final character of the escape
473    sequence.
474    
475  Arguments:  Arguments:
476    ptrptr         points to the pattern position pointer    ptrptr         points to the pattern position pointer
# Line 370  Arguments: Line 481  Arguments:
481    
482  Returns:         zero or positive => a data character  Returns:         zero or positive => a data character
483                   negative => a special escape sequence                   negative => a special escape sequence
484                   on error, errorptr is set                   on error, errorcodeptr is set
485  */  */
486    
487  static int  static int
# Line 388  ptr--; /* Set Line 499  ptr--; /* Set
499    
500  if (c == 0) *errorcodeptr = ERR1;  if (c == 0) *errorcodeptr = ERR1;
501    
502  /* Non-alphamerics are literals. For digits or letters, do an initial lookup in  /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
503  a table. A non-zero result is something that can be returned immediately.  in a table. A non-zero result is something that can be returned immediately.
504  Otherwise further processing may be required. */  Otherwise further processing may be required. */
505    
506  #if !EBCDIC    /* ASCII coding */  #ifndef EBCDIC  /* ASCII coding */
507  else if (c < '0' || c > 'z') {}                           /* Not alphameric */  else if (c < '0' || c > 'z') {}                           /* Not alphanumeric */
508  else if ((i = escapes[c - '0']) != 0) c = i;  else if ((i = escapes[c - '0']) != 0) c = i;
509    
510  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
511  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphameric */  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphanumeric */
512  else if ((i = escapes[c - 0x48]) != 0)  c = i;  else if ((i = escapes[c - 0x48]) != 0)  c = i;
513  #endif  #endif
514    
# Line 406  else if ((i = escapes[c - 0x48]) != 0) Line 517  else if ((i = escapes[c - 0x48]) != 0)
517  else  else
518    {    {
519    const uschar *oldptr;    const uschar *oldptr;
520      BOOL braced, negated;
521    
522    switch (c)    switch (c)
523      {      {
524      /* A number of Perl escapes are not handled by PCRE. We give an explicit      /* A number of Perl escapes are not handled by PCRE. We give an explicit
# Line 419  else Line 532  else
532      *errorcodeptr = ERR37;      *errorcodeptr = ERR37;
533      break;      break;
534    
535        /* \g must be followed by one of a number of specific things:
536    
537        (1) A number, either plain or braced. If positive, it is an absolute
538        backreference. If negative, it is a relative backreference. This is a Perl
539        5.10 feature.
540    
541        (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
542        is part of Perl's movement towards a unified syntax for back references. As
543        this is synonymous with \k{name}, we fudge it up by pretending it really
544        was \k.
545    
546        (3) For Oniguruma compatibility we also support \g followed by a name or a
547        number either in angle brackets or in single quotes. However, these are
548        (possibly recursive) subroutine calls, _not_ backreferences. Just return
549        the -ESC_g code (cf \k). */
550    
551        case 'g':
552        if (ptr[1] == '<' || ptr[1] == '\'')
553          {
554          c = -ESC_g;
555          break;
556          }
557    
558        /* Handle the Perl-compatible cases */
559    
560        if (ptr[1] == '{')
561          {
562          const uschar *p;
563          for (p = ptr+2; *p != 0 && *p != '}'; p++)
564            if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
565          if (*p != 0 && *p != '}')
566            {
567            c = -ESC_k;
568            break;
569            }
570          braced = TRUE;
571          ptr++;
572          }
573        else braced = FALSE;
574    
575        if (ptr[1] == '-')
576          {
577          negated = TRUE;
578          ptr++;
579          }
580        else negated = FALSE;
581    
582        c = 0;
583        while ((digitab[ptr[1]] & ctype_digit) != 0)
584          c = c * 10 + *(++ptr) - '0';
585    
586        if (c < 0)   /* Integer overflow */
587          {
588          *errorcodeptr = ERR61;
589          break;
590          }
591    
592        if (braced && *(++ptr) != '}')
593          {
594          *errorcodeptr = ERR57;
595          break;
596          }
597    
598        if (c == 0)
599          {
600          *errorcodeptr = ERR58;
601          break;
602          }
603    
604        if (negated)
605          {
606          if (c > bracount)
607            {
608            *errorcodeptr = ERR15;
609            break;
610            }
611          c = bracount - (c - 1);
612          }
613    
614        c = -(ESC_REF + c);
615        break;
616    
617      /* The handling of escape sequences consisting of a string of digits      /* The handling of escape sequences consisting of a string of digits
618      starting with one that is not zero is not straightforward. By experiment,      starting with one that is not zero is not straightforward. By experiment,
619      the way Perl works seems to be as follows:      the way Perl works seems to be as follows:
# Line 440  else Line 635  else
635        c -= '0';        c -= '0';
636        while ((digitab[ptr[1]] & ctype_digit) != 0)        while ((digitab[ptr[1]] & ctype_digit) != 0)
637          c = c * 10 + *(++ptr) - '0';          c = c * 10 + *(++ptr) - '0';
638          if (c < 0)    /* Integer overflow */
639            {
640            *errorcodeptr = ERR61;
641            break;
642            }
643        if (c < 10 || c <= bracount)        if (c < 10 || c <= bracount)
644          {          {
645          c = -(ESC_REF + c);          c = -(ESC_REF + c);
# Line 460  else Line 660  else
660        }        }
661    
662      /* \0 always starts an octal number, but we may drop through to here with a      /* \0 always starts an octal number, but we may drop through to here with a
663      larger first octal digit. */      larger first octal digit. The original code used just to take the least
664        significant 8 bits of octal numbers (I think this is what early Perls used
665        to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
666        than 3 octal digits. */
667    
668      case '0':      case '0':
669      c -= '0';      c -= '0';
670      while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')      while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
671          c = c * 8 + *(++ptr) - '0';          c = c * 8 + *(++ptr) - '0';
672      c &= 255;     /* Take least significant 8 bits */      if (!utf8 && c > 255) *errorcodeptr = ERR51;
673      break;      break;
674    
675      /* \x is complicated. \x{ddd} is a character number which can be greater      /* \x is complicated. \x{ddd} is a character number which can be greater
# Line 486  else Line 689  else
689          if (c == 0 && cc == '0') continue;     /* Leading zeroes */          if (c == 0 && cc == '0') continue;     /* Leading zeroes */
690          count++;          count++;
691    
692  #if !EBCDIC    /* ASCII coding */  #ifndef EBCDIC  /* ASCII coding */
693          if (cc >= 'a') cc -= 32;               /* Convert to upper case */          if (cc >= 'a') cc -= 32;               /* Convert to upper case */
694          c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));          c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
695  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
696          if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */          if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */
697          c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));          c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
698  #endif  #endif
# Line 513  else Line 716  else
716        {        {
717        int cc;                               /* Some compilers don't like ++ */        int cc;                               /* Some compilers don't like ++ */
718        cc = *(++ptr);                        /* in initializers */        cc = *(++ptr);                        /* in initializers */
719  #if !EBCDIC    /* ASCII coding */  #ifndef EBCDIC  /* ASCII coding */
720        if (cc >= 'a') cc -= 32;              /* Convert to upper case */        if (cc >= 'a') cc -= 32;              /* Convert to upper case */
721        c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));        c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
722  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
723        if (cc <= 'z') cc += 64;              /* Convert to upper case */        if (cc <= 'z') cc += 64;              /* Convert to upper case */
724        c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));        c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
725  #endif  #endif
726        }        }
727      break;      break;
728    
729      /* Other special escapes not starting with a digit are straightforward */      /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
730        This coding is ASCII-specific, but then the whole concept of \cx is
731        ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
732    
733      case 'c':      case 'c':
734      c = *(++ptr);      c = *(++ptr);
735      if (c == 0)      if (c == 0)
736        {        {
737        *errorcodeptr = ERR2;        *errorcodeptr = ERR2;
738        return 0;        break;
739        }        }
740    
741      /* A letter is upper-cased; then the 0x40 bit is flipped. This coding  #ifndef EBCDIC  /* ASCII coding */
     is ASCII-specific, but then the whole concept of \cx is ASCII-specific.  
     (However, an EBCDIC equivalent has now been added.) */  
   
 #if !EBCDIC    /* ASCII coding */  
742      if (c >= 'a' && c <= 'z') c -= 32;      if (c >= 'a' && c <= 'z') c -= 32;
743      c ^= 0x40;      c ^= 0x40;
744  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
745      if (c >= 'a' && c <= 'z') c += 64;      if (c >= 'a' && c <= 'z') c += 64;
746      c ^= 0xC0;      c ^= 0xC0;
747  #endif  #endif
748      break;      break;
749    
750      /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any      /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
751      other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,      other alphanumeric following \ is an error if PCRE_EXTRA was set;
752      for Perl compatibility, it is a literal. This code looks a bit odd, but      otherwise, for Perl compatibility, it is a literal. This code looks a bit
753      there used to be some cases other than the default, and there may be again      odd, but there used to be some cases other than the default, and there may
754      in future, so I haven't "optimized" it. */      be again in future, so I haven't "optimized" it. */
755    
756      default:      default:
757      if ((options & PCRE_EXTRA) != 0) switch(c)      if ((options & PCRE_EXTRA) != 0) switch(c)
# Line 610  if (c == '{') Line 811  if (c == '{')
811      *negptr = TRUE;      *negptr = TRUE;
812      ptr++;      ptr++;
813      }      }
814    for (i = 0; i < sizeof(name) - 1; i++)    for (i = 0; i < (int)sizeof(name) - 1; i++)
815      {      {
816      c = *(++ptr);      c = *(++ptr);
817      if (c == 0) goto ERROR_RETURN;      if (c == 0) goto ERROR_RETURN;
# Line 639  top = _pcre_utt_size; Line 840  top = _pcre_utt_size;
840  while (bot < top)  while (bot < top)
841    {    {
842    i = (bot + top) >> 1;    i = (bot + top) >> 1;
843    c = strcmp(name, _pcre_utt[i].name);    c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);
844    if (c == 0)    if (c == 0)
845      {      {
846      *dptr = _pcre_utt[i].value;      *dptr = _pcre_utt[i].value;
# Line 763  return p; Line 964  return p;
964    
965    
966  /*************************************************  /*************************************************
967    *       Find forward referenced subpattern       *
968    *************************************************/
969    
970    /* This function scans along a pattern's text looking for capturing
971    subpatterns, and counting them. If it finds a named pattern that matches the
972    name it is given, it returns its number. Alternatively, if the name is NULL, it
973    returns when it reaches a given numbered subpattern. This is used for forward
974    references to subpatterns. We know that if (?P< is encountered, the name will
975    be terminated by '>' because that is checked in the first pass.
976    
977    Arguments:
978      ptr          current position in the pattern
979      count        current count of capturing parens so far encountered
980      name         name to seek, or NULL if seeking a numbered subpattern
981      lorn         name length, or subpattern number if name is NULL
982      xmode        TRUE if we are in /x mode
983    
984    Returns:       the number of the named subpattern, or -1 if not found
985    */
986    
987    static int
988    find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
989      BOOL xmode)
990    {
991    const uschar *thisname;
992    
993    for (; *ptr != 0; ptr++)
994      {
995      int term;
996    
997      /* Skip over backslashed characters and also entire \Q...\E */
998    
999      if (*ptr == '\\')
1000        {
1001        if (*(++ptr) == 0) return -1;
1002        if (*ptr == 'Q') for (;;)
1003          {
1004          while (*(++ptr) != 0 && *ptr != '\\');
1005          if (*ptr == 0) return -1;
1006          if (*(++ptr) == 'E') break;
1007          }
1008        continue;
1009        }
1010    
1011      /* Skip over character classes */
1012    
1013      if (*ptr == '[')
1014        {
1015        while (*(++ptr) != ']')
1016          {
1017          if (*ptr == 0) return -1;
1018          if (*ptr == '\\')
1019            {
1020            if (*(++ptr) == 0) return -1;
1021            if (*ptr == 'Q') for (;;)
1022              {
1023              while (*(++ptr) != 0 && *ptr != '\\');
1024              if (*ptr == 0) return -1;
1025              if (*(++ptr) == 'E') break;
1026              }
1027            continue;
1028            }
1029          }
1030        continue;
1031        }
1032    
1033      /* Skip comments in /x mode */
1034    
1035      if (xmode && *ptr == '#')
1036        {
1037        while (*(++ptr) != 0 && *ptr != '\n');
1038        if (*ptr == 0) return -1;
1039        continue;
1040        }
1041    
1042      /* An opening parens must now be a real metacharacter */
1043    
1044      if (*ptr != '(') continue;
1045      if (ptr[1] != '?' && ptr[1] != '*')
1046        {
1047        count++;
1048        if (name == NULL && count == lorn) return count;
1049        continue;
1050        }
1051    
1052      ptr += 2;
1053      if (*ptr == 'P') ptr++;                      /* Allow optional P */
1054    
1055      /* We have to disambiguate (?<! and (?<= from (?<name> */
1056    
1057      if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
1058           *ptr != '\'')
1059        continue;
1060    
1061      count++;
1062    
1063      if (name == NULL && count == lorn) return count;
1064      term = *ptr++;
1065      if (term == '<') term = '>';
1066      thisname = ptr;
1067      while (*ptr != term) ptr++;
1068      if (name != NULL && lorn == ptr - thisname &&
1069          strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1070        return count;
1071      }
1072    
1073    return -1;
1074    }
1075    
1076    
1077    
1078    /*************************************************
1079  *      Find first significant op code            *  *      Find first significant op code            *
1080  *************************************************/  *************************************************/
1081    
# Line 811  for (;;) Line 1124  for (;;)
1124    
1125      case OP_CALLOUT:      case OP_CALLOUT:
1126      case OP_CREF:      case OP_CREF:
1127      case OP_BRANUMBER:      case OP_RREF:
1128        case OP_DEF:
1129      code += _pcre_OP_lengths[*code];      code += _pcre_OP_lengths[*code];
1130      break;      break;
1131    
# Line 856  for (;;) Line 1170  for (;;)
1170    {    {
1171    int d;    int d;
1172    register int op = *cc;    register int op = *cc;
   if (op >= OP_BRA) op = OP_BRA;  
   
1173    switch (op)    switch (op)
1174      {      {
1175        case OP_CBRA:
1176      case OP_BRA:      case OP_BRA:
1177      case OP_ONCE:      case OP_ONCE:
1178      case OP_COND:      case OP_COND:
1179      d = find_fixedlength(cc, options);      d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1180      if (d < 0) return d;      if (d < 0) return d;
1181      branchlength += d;      branchlength += d;
1182      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
# Line 898  for (;;) Line 1211  for (;;)
1211      /* Skip over things that don't match chars */      /* Skip over things that don't match chars */
1212    
1213      case OP_REVERSE:      case OP_REVERSE:
     case OP_BRANUMBER:  
1214      case OP_CREF:      case OP_CREF:
1215        case OP_RREF:
1216        case OP_DEF:
1217      case OP_OPT:      case OP_OPT:
1218      case OP_CALLOUT:      case OP_CALLOUT:
1219      case OP_SOD:      case OP_SOD:
# Line 917  for (;;) Line 1231  for (;;)
1231    
1232      case OP_CHAR:      case OP_CHAR:
1233      case OP_CHARNC:      case OP_CHARNC:
1234        case OP_NOT:
1235      branchlength++;      branchlength++;
1236      cc += 2;      cc += 2;
1237  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
# Line 943  for (;;) Line 1258  for (;;)
1258    
1259      case OP_TYPEEXACT:      case OP_TYPEEXACT:
1260      branchlength += GET2(cc,1);      branchlength += GET2(cc,1);
1261        if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1262      cc += 4;      cc += 4;
1263      break;      break;
1264    
# Line 1031  Returns: pointer to the opcode for Line 1347  Returns: pointer to the opcode for
1347  static const uschar *  static const uschar *
1348  find_bracket(const uschar *code, BOOL utf8, int number)  find_bracket(const uschar *code, BOOL utf8, int number)
1349  {  {
 #ifndef SUPPORT_UTF8  
 utf8 = utf8;               /* Stop pedantic compilers complaining */  
 #endif  
   
1350  for (;;)  for (;;)
1351    {    {
1352    register int c = *code;    register int c = *code;
1353    if (c == OP_END) return NULL;    if (c == OP_END) return NULL;
1354    else if (c > OP_BRA)  
1355      /* XCLASS is used for classes that cannot be represented just by a bit
1356      map. This includes negated single high-valued characters. The length in
1357      the table is zero; the actual length is stored in the compiled code. */
1358    
1359      if (c == OP_XCLASS) code += GET(code, 1);
1360    
1361      /* Handle capturing bracket */
1362    
1363      else if (c == OP_CBRA)
1364      {      {
1365      int n = c - OP_BRA;      int n = GET2(code, 1+LINK_SIZE);
     if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE);  
1366      if (n == number) return (uschar *)code;      if (n == number) return (uschar *)code;
1367      code += _pcre_OP_lengths[OP_BRA];      code += _pcre_OP_lengths[c];
1368      }      }
1369    
1370      /* Otherwise, we can get the item's length from the table, except that for
1371      repeated character types, we have to test for \p and \P, which have an extra
1372      two bytes of parameters. */
1373    
1374    else    else
1375      {      {
1376      code += _pcre_OP_lengths[c];      switch(c)
1377          {
1378          case OP_TYPESTAR:
1379          case OP_TYPEMINSTAR:
1380          case OP_TYPEPLUS:
1381          case OP_TYPEMINPLUS:
1382          case OP_TYPEQUERY:
1383          case OP_TYPEMINQUERY:
1384          case OP_TYPEPOSSTAR:
1385          case OP_TYPEPOSPLUS:
1386          case OP_TYPEPOSQUERY:
1387          if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1388          break;
1389    
1390  #ifdef SUPPORT_UTF8        case OP_TYPEUPTO:
1391          case OP_TYPEMINUPTO:
1392          case OP_TYPEEXACT:
1393          case OP_TYPEPOSUPTO:
1394          if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1395          break;
1396          }
1397    
1398      /* In UTF-8 mode, opcodes that are followed by a character may be followed      /* Add in the fixed length from the table */
1399      by a multi-byte character. The length in the table is a minimum, so we have  
1400      to scan along to skip the extra bytes. All opcodes are less than 128, so we      code += _pcre_OP_lengths[c];
1401      can use relatively efficient code. */  
1402      /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1403      a multi-byte character. The length in the table is a minimum, so we have to
1404      arrange to skip the extra bytes. */
1405    
1406    #ifdef SUPPORT_UTF8
1407      if (utf8) switch(c)      if (utf8) switch(c)
1408        {        {
1409        case OP_CHAR:        case OP_CHAR:
# Line 1064  for (;;) Line 1411  for (;;)
1411        case OP_EXACT:        case OP_EXACT:
1412        case OP_UPTO:        case OP_UPTO:
1413        case OP_MINUPTO:        case OP_MINUPTO:
1414          case OP_POSUPTO:
1415        case OP_STAR:        case OP_STAR:
1416        case OP_MINSTAR:        case OP_MINSTAR:
1417          case OP_POSSTAR:
1418        case OP_PLUS:        case OP_PLUS:
1419        case OP_MINPLUS:        case OP_MINPLUS:
1420          case OP_POSPLUS:
1421        case OP_QUERY:        case OP_QUERY:
1422        case OP_MINQUERY:        case OP_MINQUERY:
1423        while ((*code & 0xc0) == 0x80) code++;        case OP_POSQUERY:
1424        break;        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
   
       /* XCLASS is used for classes that cannot be represented just by a bit  
       map. This includes negated single high-valued characters. The length in  
       the table is zero; the actual length is stored in the compiled code. */  
   
       case OP_XCLASS:  
       code += GET(code, 1) + 1;  
1425        break;        break;
1426        }        }
1427  #endif  #endif
# Line 1105  Returns: pointer to the opcode for Line 1448  Returns: pointer to the opcode for
1448  static const uschar *  static const uschar *
1449  find_recurse(const uschar *code, BOOL utf8)  find_recurse(const uschar *code, BOOL utf8)
1450  {  {
 #ifndef SUPPORT_UTF8  
 utf8 = utf8;               /* Stop pedantic compilers complaining */  
 #endif  
   
1451  for (;;)  for (;;)
1452    {    {
1453    register int c = *code;    register int c = *code;
1454    if (c == OP_END) return NULL;    if (c == OP_END) return NULL;
1455    else if (c == OP_RECURSE) return code;    if (c == OP_RECURSE) return code;
1456    else if (c > OP_BRA)  
1457      {    /* XCLASS is used for classes that cannot be represented just by a bit
1458      code += _pcre_OP_lengths[OP_BRA];    map. This includes negated single high-valued characters. The length in
1459      }    the table is zero; the actual length is stored in the compiled code. */
1460    
1461      if (c == OP_XCLASS) code += GET(code, 1);
1462    
1463      /* Otherwise, we can get the item's length from the table, except that for
1464      repeated character types, we have to test for \p and \P, which have an extra
1465      two bytes of parameters. */
1466    
1467    else    else
1468      {      {
1469      code += _pcre_OP_lengths[c];      switch(c)
1470          {
1471          case OP_TYPESTAR:
1472          case OP_TYPEMINSTAR:
1473          case OP_TYPEPLUS:
1474          case OP_TYPEMINPLUS:
1475          case OP_TYPEQUERY:
1476          case OP_TYPEMINQUERY:
1477          case OP_TYPEPOSSTAR:
1478          case OP_TYPEPOSPLUS:
1479          case OP_TYPEPOSQUERY:
1480          if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1481          break;
1482    
1483  #ifdef SUPPORT_UTF8        case OP_TYPEPOSUPTO:
1484          case OP_TYPEUPTO:
1485          case OP_TYPEMINUPTO:
1486          case OP_TYPEEXACT:
1487          if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1488          break;
1489          }
1490    
1491        /* Add in the fixed length from the table */
1492    
1493        code += _pcre_OP_lengths[c];
1494    
1495      /* In UTF-8 mode, opcodes that are followed by a character may be followed      /* In UTF-8 mode, opcodes that are followed by a character may be followed
1496      by a multi-byte character. The length in the table is a minimum, so we have      by a multi-byte character. The length in the table is a minimum, so we have
1497      to scan along to skip the extra bytes. All opcodes are less than 128, so we      to arrange to skip the extra bytes. */
     can use relatively efficient code. */  
1498    
1499    #ifdef SUPPORT_UTF8
1500      if (utf8) switch(c)      if (utf8) switch(c)
1501        {        {
1502        case OP_CHAR:        case OP_CHAR:
# Line 1136  for (;;) Line 1504  for (;;)
1504        case OP_EXACT:        case OP_EXACT:
1505        case OP_UPTO:        case OP_UPTO:
1506        case OP_MINUPTO:        case OP_MINUPTO:
1507          case OP_POSUPTO:
1508        case OP_STAR:        case OP_STAR:
1509        case OP_MINSTAR:        case OP_MINSTAR:
1510          case OP_POSSTAR:
1511        case OP_PLUS:        case OP_PLUS:
1512        case OP_MINPLUS:        case OP_MINPLUS:
1513          case OP_POSPLUS:
1514        case OP_QUERY:        case OP_QUERY:
1515        case OP_MINQUERY:        case OP_MINQUERY:
1516        while ((*code & 0xc0) == 0x80) code++;        case OP_POSQUERY:
1517        break;        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
   
       /* XCLASS is used for classes that cannot be represented just by a bit  
       map. This includes negated single high-valued characters. The length in  
       the table is zero; the actual length is stored in the compiled code. */  
   
       case OP_XCLASS:  
       code += GET(code, 1) + 1;  
1518        break;        break;
1519        }        }
1520  #endif  #endif
# Line 1165  for (;;) Line 1529  for (;;)
1529  *************************************************/  *************************************************/
1530    
1531  /* This function scans through a branch of a compiled pattern to see whether it  /* This function scans through a branch of a compiled pattern to see whether it
1532  can match the empty string or not. It is called only from could_be_empty()  can match the empty string or not. It is called from could_be_empty()
1533  below. Note that first_significant_code() skips over assertions. If we hit an  below and from compile_branch() when checking for an unlimited repeat of a
1534  unclosed bracket, we return "empty" - this means we've struck an inner bracket  group that can match nothing. Note that first_significant_code() skips over
1535  whose current branch will already have been scanned.  backward and negative forward assertions when its final argument is TRUE. If we
1536    hit an unclosed bracket, we return "empty" - this means we've struck an inner
1537    bracket whose current branch will already have been scanned.
1538    
1539  Arguments:  Arguments:
1540    code        points to start of search    code        points to start of search
# Line 1182  static BOOL Line 1548  static BOOL
1548  could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)  could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1549  {  {
1550  register int c;  register int c;
1551  for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0, TRUE);  for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1552       code < endcode;       code < endcode;
1553       code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))       code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1554    {    {
# Line 1190  for (code = first_significant_code(code Line 1556  for (code = first_significant_code(code
1556    
1557    c = *code;    c = *code;
1558    
1559    if (c >= OP_BRA)    /* Skip over forward assertions; the other assertions are skipped by
1560      first_significant_code() with a TRUE final argument. */
1561    
1562      if (c == OP_ASSERT)
1563        {
1564        do code += GET(code, 1); while (*code == OP_ALT);
1565        c = *code;
1566        continue;
1567        }
1568    
1569      /* Groups with zero repeats can of course be empty; skip them. */
1570    
1571      if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO)
1572        {
1573        code += _pcre_OP_lengths[c];
1574        do code += GET(code, 1); while (*code == OP_ALT);
1575        c = *code;
1576        continue;
1577        }
1578    
1579      /* For other groups, scan the branches. */
1580    
1581      if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1582      {      {
1583      BOOL empty_branch;      BOOL empty_branch;
1584      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
# Line 1206  for (code = first_significant_code(code Line 1594  for (code = first_significant_code(code
1594        }        }
1595      while (*code == OP_ALT);      while (*code == OP_ALT);
1596      if (!empty_branch) return FALSE;   /* All branches are non-empty */      if (!empty_branch) return FALSE;   /* All branches are non-empty */
     code += 1 + LINK_SIZE;  
1597      c = *code;      c = *code;
1598        continue;
1599      }      }
1600    
1601    else switch (c)    /* Handle the other opcodes */
1602    
1603      switch (c)
1604      {      {
1605      /* Check for quantifiers after a class */      /* Check for quantifiers after a class. XCLASS is used for classes that
1606        cannot be represented just by a bit map. This includes negated single
1607        high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1608        actual length is stored in the compiled code, so we must update "code"
1609        here. */
1610    
1611  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1612      case OP_XCLASS:      case OP_XCLASS:
1613      ccode = code + GET(code, 1);      ccode = code += GET(code, 1);
1614      goto CHECK_CLASS_REPEAT;      goto CHECK_CLASS_REPEAT;
1615  #endif  #endif
1616    
# Line 1266  for (code = first_significant_code(code Line 1660  for (code = first_significant_code(code
1660      case OP_NOT:      case OP_NOT:
1661      case OP_PLUS:      case OP_PLUS:
1662      case OP_MINPLUS:      case OP_MINPLUS:
1663        case OP_POSPLUS:
1664      case OP_EXACT:      case OP_EXACT:
1665      case OP_NOTPLUS:      case OP_NOTPLUS:
1666      case OP_NOTMINPLUS:      case OP_NOTMINPLUS:
1667        case OP_NOTPOSPLUS:
1668      case OP_NOTEXACT:      case OP_NOTEXACT:
1669      case OP_TYPEPLUS:      case OP_TYPEPLUS:
1670      case OP_TYPEMINPLUS:      case OP_TYPEMINPLUS:
1671        case OP_TYPEPOSPLUS:
1672      case OP_TYPEEXACT:      case OP_TYPEEXACT:
1673      return FALSE;      return FALSE;
1674    
1675        /* These are going to continue, as they may be empty, but we have to
1676        fudge the length for the \p and \P cases. */
1677    
1678        case OP_TYPESTAR:
1679        case OP_TYPEMINSTAR:
1680        case OP_TYPEPOSSTAR:
1681        case OP_TYPEQUERY:
1682        case OP_TYPEMINQUERY:
1683        case OP_TYPEPOSQUERY:
1684        if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1685        break;
1686    
1687        /* Same for these */
1688    
1689        case OP_TYPEUPTO:
1690        case OP_TYPEMINUPTO:
1691        case OP_TYPEPOSUPTO:
1692        if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1693        break;
1694    
1695      /* End of branch */      /* End of branch */
1696    
1697      case OP_KET:      case OP_KET:
# Line 1283  for (code = first_significant_code(code Line 1700  for (code = first_significant_code(code
1700      case OP_ALT:      case OP_ALT:
1701      return TRUE;      return TRUE;
1702    
1703      /* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO  may be      /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1704      followed by a multibyte character */      MINUPTO, and POSUPTO may be followed by a multibyte character */
1705    
1706  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1707      case OP_STAR:      case OP_STAR:
1708      case OP_MINSTAR:      case OP_MINSTAR:
1709        case OP_POSSTAR:
1710      case OP_QUERY:      case OP_QUERY:
1711      case OP_MINQUERY:      case OP_MINQUERY:
1712        case OP_POSQUERY:
1713      case OP_UPTO:      case OP_UPTO:
1714      case OP_MINUPTO:      case OP_MINUPTO:
1715        case OP_POSUPTO:
1716      if (utf8) while ((code[2] & 0xc0) == 0x80) code++;      if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1717      break;      break;
1718  #endif  #endif
# Line 1341  return TRUE; Line 1761  return TRUE;
1761  *************************************************/  *************************************************/
1762    
1763  /* This function is called when the sequence "[:" or "[." or "[=" is  /* This function is called when the sequence "[:" or "[." or "[=" is
1764  encountered in a character class. It checks whether this is followed by an  encountered in a character class. It checks whether this is followed by a
1765  optional ^ and then a sequence of letters, terminated by a matching ":]" or  sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
1766  ".]" or "=]".  reach an unescaped ']' without the special preceding character, return FALSE.
1767    
1768    Originally, this function only recognized a sequence of letters between the
1769    terminators, but it seems that Perl recognizes any sequence of characters,
1770    though of course unknown POSIX names are subsequently rejected. Perl gives an
1771    "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
1772    didn't consider this to be a POSIX class. Likewise for [:1234:].
1773    
1774    The problem in trying to be exactly like Perl is in the handling of escapes. We
1775    have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
1776    class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
1777    below handles the special case of \], but does not try to do any other escape
1778    processing. This makes it different from Perl for cases such as [:l\ower:]
1779    where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
1780    "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
1781    I think.
1782    
1783  Argument:  Arguments:
1784    ptr      pointer to the initial [    ptr      pointer to the initial [
1785    endptr   where to return the end pointer    endptr   where to return the end pointer
   cd       pointer to compile data  
1786    
1787  Returns:   TRUE or FALSE  Returns:   TRUE or FALSE
1788  */  */
1789    
1790  static BOOL  static BOOL
1791  check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)  check_posix_syntax(const uschar *ptr, const uschar **endptr)
1792  {  {
1793  int terminator;          /* Don't combine these lines; the Solaris cc */  int terminator;          /* Don't combine these lines; the Solaris cc */
1794  terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */  terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
1795  if (*(++ptr) == '^') ptr++;  for (++ptr; *ptr != 0; ptr++)
 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;  
 if (*ptr == terminator && ptr[1] == ']')  
1796    {    {
1797    *endptr = ptr;    if (*ptr == '\\' && ptr[1] == ']') ptr++; else
1798    return TRUE;      {
1799        if (*ptr == ']') return FALSE;
1800        if (*ptr == terminator && ptr[1] == ']')
1801          {
1802          *endptr = ptr;
1803          return TRUE;
1804          }
1805        }
1806    }    }
1807  return FALSE;  return FALSE;
1808  }  }
# Line 1388  Returns: a value representing the na Line 1827  Returns: a value representing the na
1827  static int  static int
1828  check_posix_name(const uschar *ptr, int len)  check_posix_name(const uschar *ptr, int len)
1829  {  {
1830    const char *pn = posix_names;
1831  register int yield = 0;  register int yield = 0;
1832  while (posix_name_lengths[yield] != 0)  while (posix_name_lengths[yield] != 0)
1833    {    {
1834    if (len == posix_name_lengths[yield] &&    if (len == posix_name_lengths[yield] &&
1835      strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;      strncmp((const char *)ptr, pn, len) == 0) return yield;
1836      pn += posix_name_lengths[yield] + 1;
1837    yield++;    yield++;
1838    }    }
1839  return -1;  return -1;
# Line 1407  return -1; Line 1848  return -1;
1848  that is referenced. This means that groups can be replicated for fixed  that is referenced. This means that groups can be replicated for fixed
1849  repetition simply by copying (because the recursion is allowed to refer to  repetition simply by copying (because the recursion is allowed to refer to
1850  earlier groups that are outside the current group). However, when a group is  earlier groups that are outside the current group). However, when a group is
1851  optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before  optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
1852  it, after it has been compiled. This means that any OP_RECURSE items within it  inserted before it, after it has been compiled. This means that any OP_RECURSE
1853  that refer to the group itself or any contained groups have to have their  items within it that refer to the group itself or any contained groups have to
1854  offsets adjusted. That is the job of this function. Before it is called, the  have their offsets adjusted. That is one of the jobs of this function. Before it
1855  partially compiled regex must be temporarily terminated with OP_END.  is called, the partially compiled regex must be temporarily terminated with
1856    OP_END.
1857    
1858    This function has been extended with the possibility of forward references for
1859    recursions and subroutine calls. It must also check the list of such references
1860    for the group we are dealing with. If it finds that one of the recursions in
1861    the current group is on this list, it adjusts the offset in the list, not the
1862    value in the reference (which is a group number).
1863    
1864  Arguments:  Arguments:
1865    group      points to the start of the group    group      points to the start of the group
1866    adjust     the amount by which the group is to be moved    adjust     the amount by which the group is to be moved
1867    utf8       TRUE in UTF-8 mode    utf8       TRUE in UTF-8 mode
1868    cd         contains pointers to tables etc.    cd         contains pointers to tables etc.
1869      save_hwm   the hwm forward reference pointer at the start of the group
1870    
1871  Returns:     nothing  Returns:     nothing
1872  */  */
1873    
1874  static void  static void
1875  adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd)  adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1876      uschar *save_hwm)
1877  {  {
1878  uschar *ptr = group;  uschar *ptr = group;
1879    
1880  while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)  while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1881    {    {
1882    int offset = GET(ptr, 1);    int offset;
1883    if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);    uschar *hc;
1884    
1885      /* See if this recursion is on the forward reference list. If so, adjust the
1886      reference. */
1887    
1888      for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1889        {
1890        offset = GET(hc, 0);
1891        if (cd->start_code + offset == ptr + 1)
1892          {
1893          PUT(hc, 0, offset + adjust);
1894          break;
1895          }
1896        }
1897    
1898      /* Otherwise, adjust the recursion offset if it's after the start of this
1899      group. */
1900    
1901      if (hc >= cd->hwm)
1902        {
1903        offset = GET(ptr, 1);
1904        if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1905        }
1906    
1907    ptr += 1 + LINK_SIZE;    ptr += 1 + LINK_SIZE;
1908    }    }
1909  }  }
# Line 1508  Yield: TRUE when range returned; Line 1982  Yield: TRUE when range returned;
1982  */  */
1983    
1984  static BOOL  static BOOL
1985  get_othercase_range(int *cptr, int d, int *ocptr, int *odptr)  get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
1986      unsigned int *odptr)
1987  {  {
1988  int c, othercase, next;  unsigned int c, othercase, next;
1989    
1990  for (c = *cptr; c <= d; c++)  for (c = *cptr; c <= d; c++)
1991    { if ((othercase = _pcre_ucp_othercase(c)) >= 0) break; }    { if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) break; }
1992    
1993  if (c > d) return FALSE;  if (c > d) return FALSE;
1994    
# Line 1534  return TRUE; Line 2009  return TRUE;
2009  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2010    
2011    
2012    
2013  /*************************************************  /*************************************************
2014  *           Compile one branch                   *  *     Check if auto-possessifying is possible    *
2015  *************************************************/  *************************************************/
2016    
2017  /* Scan the pattern, compiling it into the code vector. If the options are  /* This function is called for unlimited repeats of certain items, to see
2018  changed during the branch, the pointer is used to change the external options  whether the next thing could possibly match the repeated item. If not, it makes
2019  bits.  sense to automatically possessify the repeated item.
2020    
2021  Arguments:  Arguments:
2022    optionsptr     pointer to the option bits    op_code       the repeated op code
2023    brackets       points to number of extracting brackets used    item          data for this item, depends on the opcode
2024    codeptr        points to the pointer to the current code point    utf8          TRUE in UTF-8 mode
2025    ptrptr         points to the current pattern pointer    utf8_char     used for utf8 character bytes, NULL if not relevant
2026    errorcodeptr   points to error code variable    ptr           next character in pattern
2027    firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)    options       options bits
2028    reqbyteptr     set to the last literal character required, else < 0    cd            contains pointers to tables etc.
   bcptr          points to current branch chain  
   cd             contains pointers to tables etc.  
2029    
2030  Returns:         TRUE on success  Returns:        TRUE if possessifying is wanted
                  FALSE, with *errorcodeptr set non-zero on error  
2031  */  */
2032    
2033  static BOOL  static BOOL
2034  compile_branch(int *optionsptr, int *brackets, uschar **codeptr,  check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
2035    const uschar **ptrptr, int *errorcodeptr, int *firstbyteptr,    const uschar *ptr, int options, compile_data *cd)
   int *reqbyteptr, branch_chain *bcptr, compile_data *cd)  
2036  {  {
2037  int repeat_type, op_type;  int next;
 int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */  
 int bravalue = 0;  
 int greedy_default, greedy_non_default;  
 int firstbyte, reqbyte;  
 int zeroreqbyte, zerofirstbyte;  
 int req_caseopt, reqvary, tempreqvary;  
 int condcount = 0;  
 int options = *optionsptr;  
 int after_manual_callout = 0;  
 register int c;  
 register uschar *code = *codeptr;  
 uschar *tempcode;  
 BOOL inescq = FALSE;  
 BOOL groupsetfirstbyte = FALSE;  
 const uschar *ptr = *ptrptr;  
 const uschar *tempptr;  
 uschar *previous = NULL;  
 uschar *previous_callout = NULL;  
 uschar classbits[32];  
2038    
2039  #ifdef SUPPORT_UTF8  /* Skip whitespace and comments in extended mode */
 BOOL class_utf8;  
 BOOL utf8 = (options & PCRE_UTF8) != 0;  
 uschar *class_utf8data;  
 uschar utf8_char[6];  
 #else  
 BOOL utf8 = FALSE;  
 #endif  
2040    
2041  /* Set up the default and non-default settings for greediness */  if ((options & PCRE_EXTENDED) != 0)
2042      {
2043      for (;;)
2044        {
2045        while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2046        if (*ptr == '#')
2047          {
2048          while (*(++ptr) != 0)
2049            if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2050          }
2051        else break;
2052        }
2053      }
2054    
2055    /* If the next item is one that we can handle, get its value. A non-negative
2056    value is a character, a negative value is an escape value. */
2057    
2058    if (*ptr == '\\')
2059      {
2060      int temperrorcode = 0;
2061      next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2062      if (temperrorcode != 0) return FALSE;
2063      ptr++;    /* Point after the escape sequence */
2064      }
2065    
2066    else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2067      {
2068    #ifdef SUPPORT_UTF8
2069      if (utf8) { GETCHARINC(next, ptr); } else
2070    #endif
2071      next = *ptr++;
2072      }
2073    
2074    else return FALSE;
2075    
2076    /* Skip whitespace and comments in extended mode */
2077    
2078    if ((options & PCRE_EXTENDED) != 0)
2079      {
2080      for (;;)
2081        {
2082        while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2083        if (*ptr == '#')
2084          {
2085          while (*(++ptr) != 0)
2086            if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2087          }
2088        else break;
2089        }
2090      }
2091    
2092    /* If the next thing is itself optional, we have to give up. */
2093    
2094    if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
2095      return FALSE;
2096    
2097    /* Now compare the next item with the previous opcode. If the previous is a
2098    positive single character match, "item" either contains the character or, if
2099    "item" is greater than 127 in utf8 mode, the character's bytes are in
2100    utf8_char. */
2101    
2102    
2103    /* Handle cases when the next item is a character. */
2104    
2105    if (next >= 0) switch(op_code)
2106      {
2107      case OP_CHAR:
2108    #ifdef SUPPORT_UTF8
2109      if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2110    #endif
2111      return item != next;
2112    
2113      /* For CHARNC (caseless character) we must check the other case. If we have
2114      Unicode property support, we can use it to test the other case of
2115      high-valued characters. */
2116    
2117      case OP_CHARNC:
2118    #ifdef SUPPORT_UTF8
2119      if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2120    #endif
2121      if (item == next) return FALSE;
2122    #ifdef SUPPORT_UTF8
2123      if (utf8)
2124        {
2125        unsigned int othercase;
2126        if (next < 128) othercase = cd->fcc[next]; else
2127    #ifdef SUPPORT_UCP
2128        othercase = _pcre_ucp_othercase((unsigned int)next);
2129    #else
2130        othercase = NOTACHAR;
2131    #endif
2132        return (unsigned int)item != othercase;
2133        }
2134      else
2135    #endif  /* SUPPORT_UTF8 */
2136      return (item != cd->fcc[next]);  /* Non-UTF-8 mode */
2137    
2138      /* For OP_NOT, "item" must be a single-byte character. */
2139    
2140      case OP_NOT:
2141      if (item == next) return TRUE;
2142      if ((options & PCRE_CASELESS) == 0) return FALSE;
2143    #ifdef SUPPORT_UTF8
2144      if (utf8)
2145        {
2146        unsigned int othercase;
2147        if (next < 128) othercase = cd->fcc[next]; else
2148    #ifdef SUPPORT_UCP
2149        othercase = _pcre_ucp_othercase(next);
2150    #else
2151        othercase = NOTACHAR;
2152    #endif
2153        return (unsigned int)item == othercase;
2154        }
2155      else
2156    #endif  /* SUPPORT_UTF8 */
2157      return (item == cd->fcc[next]);  /* Non-UTF-8 mode */
2158    
2159      case OP_DIGIT:
2160      return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2161    
2162      case OP_NOT_DIGIT:
2163      return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2164    
2165      case OP_WHITESPACE:
2166      return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2167    
2168      case OP_NOT_WHITESPACE:
2169      return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2170    
2171      case OP_WORDCHAR:
2172      return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2173    
2174      case OP_NOT_WORDCHAR:
2175      return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2176    
2177      case OP_HSPACE:
2178      case OP_NOT_HSPACE:
2179      switch(next)
2180        {
2181        case 0x09:
2182        case 0x20:
2183        case 0xa0:
2184        case 0x1680:
2185        case 0x180e:
2186        case 0x2000:
2187        case 0x2001:
2188        case 0x2002:
2189        case 0x2003:
2190        case 0x2004:
2191        case 0x2005:
2192        case 0x2006:
2193        case 0x2007:
2194        case 0x2008:
2195        case 0x2009:
2196        case 0x200A:
2197        case 0x202f:
2198        case 0x205f:
2199        case 0x3000:
2200        return op_code != OP_HSPACE;
2201        default:
2202        return op_code == OP_HSPACE;
2203        }
2204    
2205      case OP_VSPACE:
2206      case OP_NOT_VSPACE:
2207      switch(next)
2208        {
2209        case 0x0a:
2210        case 0x0b:
2211        case 0x0c:
2212        case 0x0d:
2213        case 0x85:
2214        case 0x2028:
2215        case 0x2029:
2216        return op_code != OP_VSPACE;
2217        default:
2218        return op_code == OP_VSPACE;
2219        }
2220    
2221      default:
2222      return FALSE;
2223      }
2224    
2225    
2226    /* Handle the case when the next item is \d, \s, etc. */
2227    
2228    switch(op_code)
2229      {
2230      case OP_CHAR:
2231      case OP_CHARNC:
2232    #ifdef SUPPORT_UTF8
2233      if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2234    #endif
2235      switch(-next)
2236        {
2237        case ESC_d:
2238        return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2239    
2240        case ESC_D:
2241        return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2242    
2243        case ESC_s:
2244        return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2245    
2246        case ESC_S:
2247        return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2248    
2249        case ESC_w:
2250        return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2251    
2252        case ESC_W:
2253        return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2254    
2255        case ESC_h:
2256        case ESC_H:
2257        switch(item)
2258          {
2259          case 0x09:
2260          case 0x20:
2261          case 0xa0:
2262          case 0x1680:
2263          case 0x180e:
2264          case 0x2000:
2265          case 0x2001:
2266          case 0x2002:
2267          case 0x2003:
2268          case 0x2004:
2269          case 0x2005:
2270          case 0x2006:
2271          case 0x2007:
2272          case 0x2008:
2273          case 0x2009:
2274          case 0x200A:
2275          case 0x202f:
2276          case 0x205f:
2277          case 0x3000:
2278          return -next != ESC_h;
2279          default:
2280          return -next == ESC_h;
2281          }
2282    
2283        case ESC_v:
2284        case ESC_V:
2285        switch(item)
2286          {
2287          case 0x0a:
2288          case 0x0b:
2289          case 0x0c:
2290          case 0x0d:
2291          case 0x85:
2292          case 0x2028:
2293          case 0x2029:
2294          return -next != ESC_v;
2295          default:
2296          return -next == ESC_v;
2297          }
2298    
2299        default:
2300        return FALSE;
2301        }
2302    
2303      case OP_DIGIT:
2304      return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2305             next == -ESC_h || next == -ESC_v;
2306    
2307      case OP_NOT_DIGIT:
2308      return next == -ESC_d;
2309    
2310      case OP_WHITESPACE:
2311      return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2312    
2313      case OP_NOT_WHITESPACE:
2314      return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2315    
2316      case OP_HSPACE:
2317      return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2318    
2319      case OP_NOT_HSPACE:
2320      return next == -ESC_h;
2321    
2322      /* Can't have \S in here because VT matches \S (Perl anomaly) */
2323      case OP_VSPACE:
2324      return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2325    
2326      case OP_NOT_VSPACE:
2327      return next == -ESC_v;
2328    
2329      case OP_WORDCHAR:
2330      return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2331    
2332      case OP_NOT_WORDCHAR:
2333      return next == -ESC_w || next == -ESC_d;
2334    
2335      default:
2336      return FALSE;
2337      }
2338    
2339    /* Control does not reach here */
2340    }
2341    
2342    
2343    
2344    /*************************************************
2345    *           Compile one branch                   *
2346    *************************************************/
2347    
2348    /* Scan the pattern, compiling it into the code vector. If the options are
2349    changed during the branch, the pointer is used to change the external options
2350    bits. This function is used during the pre-compile phase when we are trying
2351    to find out the amount of memory needed, as well as during the real compile
2352    phase. The value of lengthptr distinguishes the two phases.
2353    
2354    Arguments:
2355      optionsptr     pointer to the option bits
2356      codeptr        points to the pointer to the current code point
2357      ptrptr         points to the current pattern pointer
2358      errorcodeptr   points to error code variable
2359      firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2360      reqbyteptr     set to the last literal character required, else < 0
2361      bcptr          points to current branch chain
2362      cd             contains pointers to tables etc.
2363      lengthptr      NULL during the real compile phase
2364                     points to length accumulator during pre-compile phase
2365    
2366    Returns:         TRUE on success
2367                     FALSE, with *errorcodeptr set non-zero on error
2368    */
2369    
2370    static BOOL
2371    compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2372      int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2373      compile_data *cd, int *lengthptr)
2374    {
2375    int repeat_type, op_type;
2376    int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
2377    int bravalue = 0;
2378    int greedy_default, greedy_non_default;
2379    int firstbyte, reqbyte;
2380    int zeroreqbyte, zerofirstbyte;
2381    int req_caseopt, reqvary, tempreqvary;
2382    int options = *optionsptr;
2383    int after_manual_callout = 0;
2384    int length_prevgroup = 0;
2385    register int c;
2386    register uschar *code = *codeptr;
2387    uschar *last_code = code;
2388    uschar *orig_code = code;
2389    uschar *tempcode;
2390    BOOL inescq = FALSE;
2391    BOOL groupsetfirstbyte = FALSE;
2392    const uschar *ptr = *ptrptr;
2393    const uschar *tempptr;
2394    uschar *previous = NULL;
2395    uschar *previous_callout = NULL;
2396    uschar *save_hwm = NULL;
2397    uschar classbits[32];
2398    
2399    #ifdef SUPPORT_UTF8
2400    BOOL class_utf8;
2401    BOOL utf8 = (options & PCRE_UTF8) != 0;
2402    uschar *class_utf8data;
2403    uschar *class_utf8data_base;
2404    uschar utf8_char[6];
2405    #else
2406    BOOL utf8 = FALSE;
2407    uschar *utf8_char = NULL;
2408    #endif
2409    
2410    #ifdef DEBUG
2411    if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2412    #endif
2413    
2414    /* Set up the default and non-default settings for greediness */
2415    
2416  greedy_default = ((options & PCRE_UNGREEDY) != 0);  greedy_default = ((options & PCRE_UNGREEDY) != 0);
2417  greedy_non_default = greedy_default ^ 1;  greedy_non_default = greedy_default ^ 1;
# Line 1621  req_caseopt = ((options & PCRE_CASELESS) Line 2440  req_caseopt = ((options & PCRE_CASELESS)
2440  for (;; ptr++)  for (;; ptr++)
2441    {    {
2442    BOOL negate_class;    BOOL negate_class;
2443      BOOL should_flip_negation;
2444    BOOL possessive_quantifier;    BOOL possessive_quantifier;
2445    BOOL is_quantifier;    BOOL is_quantifier;
2446      BOOL is_recurse;
2447      BOOL reset_bracount;
2448    int class_charcount;    int class_charcount;
2449    int class_lastchar;    int class_lastchar;
2450    int newoptions;    int newoptions;
2451    int recno;    int recno;
2452      int refsign;
2453    int skipbytes;    int skipbytes;
2454    int subreqbyte;    int subreqbyte;
2455    int subfirstbyte;    int subfirstbyte;
2456      int terminator;
2457    int mclength;    int mclength;
2458    uschar mcbuffer[8];    uschar mcbuffer[8];
2459    
2460    /* Next byte in the pattern */    /* Get next byte in the pattern */
2461    
2462    c = *ptr;    c = *ptr;
2463    
2464      /* If we are in the pre-compile phase, accumulate the length used for the
2465      previous cycle of this loop. */
2466    
2467      if (lengthptr != NULL)
2468        {
2469    #ifdef DEBUG
2470        if (code > cd->hwm) cd->hwm = code;                 /* High water info */
2471    #endif
2472        if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2473          {
2474          *errorcodeptr = ERR52;
2475          goto FAILED;
2476          }
2477    
2478        /* There is at least one situation where code goes backwards: this is the
2479        case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2480        the class is simply eliminated. However, it is created first, so we have to
2481        allow memory for it. Therefore, don't ever reduce the length at this point.
2482        */
2483    
2484        if (code < last_code) code = last_code;
2485    
2486        /* Paranoid check for integer overflow */
2487    
2488        if (OFLOW_MAX - *lengthptr < code - last_code)
2489          {
2490          *errorcodeptr = ERR20;
2491          goto FAILED;
2492          }
2493    
2494        *lengthptr += code - last_code;
2495        DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2496    
2497        /* If "previous" is set and it is not at the start of the work space, move
2498        it back to there, in order to avoid filling up the work space. Otherwise,
2499        if "previous" is NULL, reset the current code pointer to the start. */
2500    
2501        if (previous != NULL)
2502          {
2503          if (previous > orig_code)
2504            {
2505            memmove(orig_code, previous, code - previous);
2506            code -= previous - orig_code;
2507            previous = orig_code;
2508            }
2509          }
2510        else code = orig_code;
2511    
2512        /* Remember where this code item starts so we can pick up the length
2513        next time round. */
2514    
2515        last_code = code;
2516        }
2517    
2518      /* In the real compile phase, just check the workspace used by the forward
2519      reference list. */
2520    
2521      else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2522        {
2523        *errorcodeptr = ERR52;
2524        goto FAILED;
2525        }
2526    
2527    /* If in \Q...\E, check for the end; if not, we have a literal */    /* If in \Q...\E, check for the end; if not, we have a literal */
2528    
# Line 1651  for (;; ptr++) Line 2538  for (;; ptr++)
2538        {        {
2539        if (previous_callout != NULL)        if (previous_callout != NULL)
2540          {          {
2541          complete_callout(previous_callout, ptr, cd);          if (lengthptr == NULL)  /* Don't attempt in pre-compile phase */
2542              complete_callout(previous_callout, ptr, cd);
2543          previous_callout = NULL;          previous_callout = NULL;
2544          }          }
2545        if ((options & PCRE_AUTO_CALLOUT) != 0)        if ((options & PCRE_AUTO_CALLOUT) != 0)
# Line 1672  for (;; ptr++) Line 2560  for (;; ptr++)
2560    if (!is_quantifier && previous_callout != NULL &&    if (!is_quantifier && previous_callout != NULL &&
2561         after_manual_callout-- <= 0)         after_manual_callout-- <= 0)
2562      {      {
2563      complete_callout(previous_callout, ptr, cd);      if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
2564          complete_callout(previous_callout, ptr, cd);
2565      previous_callout = NULL;      previous_callout = NULL;
2566      }      }
2567    
# Line 1683  for (;; ptr++) Line 2572  for (;; ptr++)
2572      if ((cd->ctypes[c] & ctype_space) != 0) continue;      if ((cd->ctypes[c] & ctype_space) != 0) continue;
2573      if (c == '#')      if (c == '#')
2574        {        {
2575        /* The space before the ; is to avoid a warning on a silly compiler        while (*(++ptr) != 0)
2576        on the Macintosh. */          {
2577        while ((c = *(++ptr)) != 0 && c != NEWLINE) ;          if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2578        if (c != 0) continue;   /* Else fall through to handle end of string */          }
2579          if (*ptr != 0) continue;
2580    
2581          /* Else fall through to handle end of string */
2582          c = 0;
2583        }        }
2584      }      }
2585    
# Line 1700  for (;; ptr++) Line 2593  for (;; ptr++)
2593    
2594    switch(c)    switch(c)
2595      {      {
2596      /* The branch terminates at end of string, |, or ). */      /* ===================================================================*/
2597        case 0:                        /* The branch terminates at string end */
2598      case 0:      case '|':                      /* or | or ) */
     case '|':  
2599      case ')':      case ')':
2600      *firstbyteptr = firstbyte;      *firstbyteptr = firstbyte;
2601      *reqbyteptr = reqbyte;      *reqbyteptr = reqbyte;
2602      *codeptr = code;      *codeptr = code;
2603      *ptrptr = ptr;      *ptrptr = ptr;
2604        if (lengthptr != NULL)
2605          {
2606          if (OFLOW_MAX - *lengthptr < code - last_code)
2607            {
2608            *errorcodeptr = ERR20;
2609            goto FAILED;
2610            }
2611          *lengthptr += code - last_code;   /* To include callout length */
2612          DPRINTF((">> end branch\n"));
2613          }
2614      return TRUE;      return TRUE;
2615    
2616    
2617        /* ===================================================================*/
2618      /* Handle single-character metacharacters. In multiline mode, ^ disables      /* Handle single-character metacharacters. In multiline mode, ^ disables
2619      the setting of any following char as a first character. */      the setting of any following char as a first character. */
2620    
# Line 1739  for (;; ptr++) Line 2643  for (;; ptr++)
2643      *code++ = OP_ANY;      *code++ = OP_ANY;
2644      break;      break;
2645    
2646    
2647        /* ===================================================================*/
2648      /* Character classes. If the included characters are all < 256, we build a      /* Character classes. If the included characters are all < 256, we build a
2649      32-byte bitmap of the permitted characters, except in the special case      32-byte bitmap of the permitted characters, except in the special case
2650      where there is only one such character. For negated classes, we build the      where there is only one such character. For negated classes, we build the
# Line 1749  for (;; ptr++) Line 2655  for (;; ptr++)
2655      opcode is compiled. It may optionally have a bit map for characters < 256,      opcode is compiled. It may optionally have a bit map for characters < 256,
2656      but those above are explicitly listed afterwards. A flag byte tells      but those above are explicitly listed afterwards. A flag byte tells
2657      whether the bitmap is present, and whether this is a negated class or not.      whether the bitmap is present, and whether this is a negated class or not.
2658      */  
2659        In JavaScript compatibility mode, an isolated ']' causes an error. In
2660        default (Perl) mode, it is treated as a data character. */
2661    
2662        case ']':
2663        if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
2664          {
2665          *errorcodeptr = ERR64;
2666          goto FAILED;
2667          }
2668        goto NORMAL_CHAR;
2669    
2670      case '[':      case '[':
2671      previous = code;      previous = code;
# Line 1758  for (;; ptr++) Line 2674  for (;; ptr++)
2674      they are encountered at the top level, so we'll do that too. */      they are encountered at the top level, so we'll do that too. */
2675    
2676      if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&      if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2677          check_posix_syntax(ptr, &tempptr, cd))          check_posix_syntax(ptr, &tempptr))
2678        {        {
2679        *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;        *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2680        goto FAILED;        goto FAILED;
2681        }        }
2682    
2683      /* If the first character is '^', set the negation flag and skip it. */      /* If the first character is '^', set the negation flag and skip it. Also,
2684        if the first few characters (either before or after ^) are \Q\E or \E we
2685        skip them too. This makes for compatibility with Perl. */
2686    
2687      if ((c = *(++ptr)) == '^')      negate_class = FALSE;
2688        for (;;)
2689        {        {
       negate_class = TRUE;  
2690        c = *(++ptr);        c = *(++ptr);
2691          if (c == '\\')
2692            {
2693            if (ptr[1] == 'E') ptr++;
2694              else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
2695                else break;
2696            }
2697          else if (!negate_class && c == '^')
2698            negate_class = TRUE;
2699          else break;
2700        }        }
2701      else  
2702        {      /* If a class contains a negative special such as \S, we need to flip the
2703        negate_class = FALSE;      negation flag at the end, so that support for characters > 255 works
2704        }      correctly (they are all included in the class). */
2705    
2706        should_flip_negation = FALSE;
2707    
2708      /* Keep a count of chars with values < 256 so that we can optimize the case      /* Keep a count of chars with values < 256 so that we can optimize the case
2709      of just a single character (as long as it's < 256). For higher valued UTF-8      of just a single character (as long as it's < 256). However, for higher
2710      characters, we don't yet do any optimization. */      valued UTF-8 characters, we don't yet do any optimization. */
2711    
2712      class_charcount = 0;      class_charcount = 0;
2713      class_lastchar = -1;      class_lastchar = -1;
2714    
2715        /* Initialize the 32-char bit map to all zeros. We build the map in a
2716        temporary bit of memory, in case the class contains only 1 character (less
2717        than 256), because in that case the compiled code doesn't use the bit map.
2718        */
2719    
2720        memset(classbits, 0, 32 * sizeof(uschar));
2721    
2722  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2723      class_utf8 = FALSE;                       /* No chars >= 256 */      class_utf8 = FALSE;                       /* No chars >= 256 */
2724      class_utf8data = code + LINK_SIZE + 34;   /* For UTF-8 items */      class_utf8data = code + LINK_SIZE + 2;    /* For UTF-8 items */
2725        class_utf8data_base = class_utf8data;     /* For resetting in pass 1 */
2726  #endif  #endif
2727    
     /* Initialize the 32-char bit map to all zeros. We have to build the  
     map in a temporary bit of store, in case the class contains only 1  
     character (< 256), because in that case the compiled code doesn't use the  
     bit map. */  
   
     memset(classbits, 0, 32 * sizeof(uschar));  
   
2728      /* Process characters until ] is reached. By writing this as a "do" it      /* Process characters until ] is reached. By writing this as a "do" it
2729      means that an initial ] is taken as a data character. The first pass      means that an initial ] is taken as a data character. At the start of the
2730      through the regex checked the overall syntax, so we don't need to be very      loop, c contains the first byte of the character. */
     strict here. At the start of the loop, c contains the first byte of the  
     character. */  
2731    
2732      do      if (c != 0) do
2733        {        {
2734          const uschar *oldptr;
2735    
2736  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2737        if (utf8 && c > 127)        if (utf8 && c > 127)
2738          {                           /* Braces are required because the */          {                           /* Braces are required because the */
2739          GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */          GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
2740          }          }
2741    
2742          /* In the pre-compile phase, accumulate the length of any UTF-8 extra
2743          data and reset the pointer. This is so that very large classes that
2744          contain a zillion UTF-8 characters no longer overwrite the work space
2745          (which is on the stack). */
2746    
2747          if (lengthptr != NULL)
2748            {
2749            *lengthptr += class_utf8data - class_utf8data_base;
2750            class_utf8data = class_utf8data_base;
2751            }
2752    
2753  #endif  #endif
2754    
2755        /* Inside \Q...\E everything is literal except \E */        /* Inside \Q...\E everything is literal except \E */
2756    
2757        if (inescq)        if (inescq)
2758          {          {
2759          if (c == '\\' && ptr[1] == 'E')          if (c == '\\' && ptr[1] == 'E')     /* If we are at \E */
2760            {            {
2761            inescq = FALSE;            inescq = FALSE;                   /* Reset literal state */
2762            ptr++;            ptr++;                            /* Skip the 'E' */
2763            continue;            continue;                         /* Carry on with next */
2764            }            }
2765          else goto LONE_SINGLE_CHARACTER;          goto CHECK_RANGE;                   /* Could be range if \E follows */
2766          }          }
2767    
2768        /* Handle POSIX class names. Perl allows a negation extension of the        /* Handle POSIX class names. Perl allows a negation extension of the
# Line 1831  for (;; ptr++) Line 2773  for (;; ptr++)
2773    
2774        if (c == '[' &&        if (c == '[' &&
2775            (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&            (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2776            check_posix_syntax(ptr, &tempptr, cd))            check_posix_syntax(ptr, &tempptr))
2777          {          {
2778          BOOL local_negate = FALSE;          BOOL local_negate = FALSE;
2779          int posix_class, taboffset, tabopt;          int posix_class, taboffset, tabopt;
# Line 1848  for (;; ptr++) Line 2790  for (;; ptr++)
2790          if (*ptr == '^')          if (*ptr == '^')
2791            {            {
2792            local_negate = TRUE;            local_negate = TRUE;
2793              should_flip_negation = TRUE;  /* Note negative special */
2794            ptr++;            ptr++;
2795            }            }
2796    
# Line 1911  for (;; ptr++) Line 2854  for (;; ptr++)
2854          }          }
2855    
2856        /* Backslash may introduce a single character, or it may introduce one        /* Backslash may introduce a single character, or it may introduce one
2857        of the specials, which just set a flag. Escaped items are checked for        of the specials, which just set a flag. The sequence \b is a special
2858        validity in the pre-compiling pass. The sequence \b is a special case.        case. Inside a class (and only there) it is treated as backspace.
2859        Inside a class (and only there) it is treated as backspace. Elsewhere        Elsewhere it marks a word boundary. Other escapes have preset maps ready
2860        it marks a word boundary. Other escapes have preset maps ready to        to 'or' into the one we are building. We assume they have more than one
       or into the one we are building. We assume they have more than one  
2861        character in them, so set class_charcount bigger than one. */        character in them, so set class_charcount bigger than one. */
2862    
2863        if (c == '\\')        if (c == '\\')
2864          {          {
2865          c = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);          c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2866            if (*errorcodeptr != 0) goto FAILED;
2867    
2868          if (-c == ESC_b) c = '\b';       /* \b is backslash in a class */          if (-c == ESC_b) c = '\b';       /* \b is backspace in a class */
2869          else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */          else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */
2870            else if (-c == ESC_R) c = 'R';   /* \R is literal R in a class */
2871          else if (-c == ESC_Q)            /* Handle start of quoted string */          else if (-c == ESC_Q)            /* Handle start of quoted string */
2872            {            {
2873            if (ptr[1] == '\\' && ptr[2] == 'E')            if (ptr[1] == '\\' && ptr[2] == 'E')
# Line 1933  for (;; ptr++) Line 2877  for (;; ptr++)
2877            else inescq = TRUE;            else inescq = TRUE;
2878            continue;            continue;
2879            }            }
2880            else if (-c == ESC_E) continue;  /* Ignore orphan \E */
2881    
2882          if (c < 0)          if (c < 0)
2883            {            {
2884            register const uschar *cbits = cd->cbits;            register const uschar *cbits = cd->cbits;
2885            class_charcount += 2;     /* Greater than 1 is what matters */            class_charcount += 2;     /* Greater than 1 is what matters */
2886            switch (-c)  
2887              /* Save time by not doing this in the pre-compile phase. */
2888    
2889              if (lengthptr == NULL) switch (-c)
2890              {              {
2891              case ESC_d:              case ESC_d:
2892              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2893              continue;              continue;
2894    
2895              case ESC_D:              case ESC_D:
2896                should_flip_negation = TRUE;
2897              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2898              continue;              continue;
2899    
# Line 1953  for (;; ptr++) Line 2902  for (;; ptr++)
2902              continue;              continue;
2903    
2904              case ESC_W:              case ESC_W:
2905                should_flip_negation = TRUE;
2906              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2907              continue;              continue;
2908    
# Line 1962  for (;; ptr++) Line 2912  for (;; ptr++)
2912              continue;              continue;
2913    
2914              case ESC_S:              case ESC_S:
2915                should_flip_negation = TRUE;
2916              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2917              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
2918              continue;              continue;
2919    
2920  #ifdef SUPPORT_UCP              default:    /* Not recognized; fall through */
2921              case ESC_p:              break;      /* Need "default" setting to stop compiler warning. */
2922              case ESC_P:              }
2923    
2924              /* In the pre-compile phase, just do the recognition. */
2925    
2926              else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2927                       c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2928    
2929              /* We need to deal with \H, \h, \V, and \v in both phases because
2930              they use extra memory. */
2931    
2932              if (-c == ESC_h)
2933                {
2934                SETBIT(classbits, 0x09); /* HT */
2935                SETBIT(classbits, 0x20); /* SPACE */
2936                SETBIT(classbits, 0xa0); /* NBSP */
2937    #ifdef SUPPORT_UTF8
2938                if (utf8)
2939                {                {
               BOOL negated;  
               int pdata;  
               int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);  
               if (ptype < 0) goto FAILED;  
2940                class_utf8 = TRUE;                class_utf8 = TRUE;
2941                *class_utf8data++ = ((-c == ESC_p) != negated)?                *class_utf8data++ = XCL_SINGLE;
2942                  XCL_PROP : XCL_NOTPROP;                class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2943                *class_utf8data++ = ptype;                *class_utf8data++ = XCL_SINGLE;
2944                *class_utf8data++ = pdata;                class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2945                class_charcount -= 2;   /* Not a < 256 character */                *class_utf8data++ = XCL_RANGE;
2946                  class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2947                  class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2948                  *class_utf8data++ = XCL_SINGLE;
2949                  class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2950                  *class_utf8data++ = XCL_SINGLE;
2951                  class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2952                  *class_utf8data++ = XCL_SINGLE;
2953                  class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
2954                }                }
             continue;  
2955  #endif  #endif
2956                continue;
2957                }
2958    
2959              /* Unrecognized escapes are faulted if PCRE is running in its            if (-c == ESC_H)
2960              strict mode. By default, for compatibility with Perl, they are              {
2961              treated as literals. */              for (c = 0; c < 32; c++)
2962                  {
2963                  int x = 0xff;
2964                  switch (c)
2965                    {
2966                    case 0x09/8: x ^= 1 << (0x09%8); break;
2967                    case 0x20/8: x ^= 1 << (0x20%8); break;
2968                    case 0xa0/8: x ^= 1 << (0xa0%8); break;
2969                    default: break;
2970                    }
2971                  classbits[c] |= x;
2972                  }
2973    
2974              default:  #ifdef SUPPORT_UTF8
2975              if ((options & PCRE_EXTRA) != 0)              if (utf8)
2976                {                {
2977                *errorcodeptr = ERR7;                class_utf8 = TRUE;
2978                goto FAILED;                *class_utf8data++ = XCL_RANGE;
2979                  class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2980                  class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
2981                  *class_utf8data++ = XCL_RANGE;
2982                  class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
2983                  class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
2984                  *class_utf8data++ = XCL_RANGE;
2985                  class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
2986                  class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
2987                  *class_utf8data++ = XCL_RANGE;
2988                  class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
2989                  class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
2990                  *class_utf8data++ = XCL_RANGE;
2991                  class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
2992                  class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
2993                  *class_utf8data++ = XCL_RANGE;
2994                  class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
2995                  class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
2996                  *class_utf8data++ = XCL_RANGE;
2997                  class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
2998                  class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2999                }                }
3000              c = *ptr;              /* The final character */  #endif
3001              class_charcount -= 2;  /* Undo the default count from above */              continue;
3002              }              }
           }  
   
         /* Fall through if we have a single character (c >= 0). This may be  
         > 256 in UTF-8 mode. */  
3003    
3004          }   /* End of backslash handling */            if (-c == ESC_v)
3005                {
3006        /* A single character may be followed by '-' to form a range. However,              SETBIT(classbits, 0x0a); /* LF */
3007        Perl does not permit ']' to be the end of the range. A '-' character              SETBIT(classbits, 0x0b); /* VT */
3008        here is treated as a literal. */              SETBIT(classbits, 0x0c); /* FF */
3009                SETBIT(classbits, 0x0d); /* CR */
3010                SETBIT(classbits, 0x85); /* NEL */
3011    #ifdef SUPPORT_UTF8
3012                if (utf8)
3013                  {
3014                  class_utf8 = TRUE;
3015                  *class_utf8data++ = XCL_RANGE;
3016                  class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
3017                  class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3018                  }
3019    #endif
3020                continue;
3021                }
3022    
3023              if (-c == ESC_V)
3024                {
3025                for (c = 0; c < 32; c++)
3026                  {
3027                  int x = 0xff;
3028                  switch (c)
3029                    {
3030                    case 0x0a/8: x ^= 1 << (0x0a%8);
3031                                 x ^= 1 << (0x0b%8);
3032                                 x ^= 1 << (0x0c%8);
3033                                 x ^= 1 << (0x0d%8);
3034                                 break;
3035                    case 0x85/8: x ^= 1 << (0x85%8); break;
3036                    default: break;
3037                    }
3038                  classbits[c] |= x;
3039                  }
3040    
3041    #ifdef SUPPORT_UTF8
3042                if (utf8)
3043                  {
3044                  class_utf8 = TRUE;
3045                  *class_utf8data++ = XCL_RANGE;
3046                  class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3047                  class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
3048                  *class_utf8data++ = XCL_RANGE;
3049                  class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3050                  class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3051                  }
3052    #endif
3053                continue;
3054                }
3055    
3056              /* We need to deal with \P and \p in both phases. */
3057    
3058    #ifdef SUPPORT_UCP
3059              if (-c == ESC_p || -c == ESC_P)
3060                {
3061                BOOL negated;
3062                int pdata;
3063                int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
3064                if (ptype < 0) goto FAILED;
3065                class_utf8 = TRUE;
3066                *class_utf8data++ = ((-c == ESC_p) != negated)?
3067                  XCL_PROP : XCL_NOTPROP;
3068                *class_utf8data++ = ptype;
3069                *class_utf8data++ = pdata;
3070                class_charcount -= 2;   /* Not a < 256 character */
3071                continue;
3072                }
3073    #endif
3074              /* Unrecognized escapes are faulted if PCRE is running in its
3075              strict mode. By default, for compatibility with Perl, they are
3076              treated as literals. */
3077    
3078        if (ptr[1] == '-' && ptr[2] != ']')            if ((options & PCRE_EXTRA) != 0)
3079                {
3080                *errorcodeptr = ERR7;
3081                goto FAILED;
3082                }
3083    
3084              class_charcount -= 2;  /* Undo the default count from above */
3085              c = *ptr;              /* Get the final character and fall through */
3086              }
3087    
3088            /* Fall through if we have a single character (c >= 0). This may be
3089            greater than 256 in UTF-8 mode. */
3090    
3091            }   /* End of backslash handling */
3092    
3093          /* A single character may be followed by '-' to form a range. However,
3094          Perl does not permit ']' to be the end of the range. A '-' character
3095          at the end is treated as a literal. Perl ignores orphaned \E sequences
3096          entirely. The code for handling \Q and \E is messy. */
3097    
3098          CHECK_RANGE:
3099          while (ptr[1] == '\\' && ptr[2] == 'E')
3100            {
3101            inescq = FALSE;
3102            ptr += 2;
3103            }
3104    
3105          oldptr = ptr;
3106    
3107          /* Remember \r or \n */
3108    
3109          if (c == '\r' || c == '\n') cd->external_flags |= PCRE_HASCRORLF;
3110    
3111          /* Check for range */
3112    
3113          if (!inescq && ptr[1] == '-')
3114          {          {
3115          int d;          int d;
3116          ptr += 2;          ptr += 2;
3117            while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
3118    
3119            /* If we hit \Q (not followed by \E) at this point, go into escaped
3120            mode. */
3121    
3122            while (*ptr == '\\' && ptr[1] == 'Q')
3123              {
3124              ptr += 2;
3125              if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
3126              inescq = TRUE;
3127              break;
3128              }
3129    
3130            if (*ptr == 0 || (!inescq && *ptr == ']'))
3131              {
3132              ptr = oldptr;
3133              goto LONE_SINGLE_CHARACTER;
3134              }
3135    
3136  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
3137          if (utf8)          if (utf8)
# Line 2026  for (;; ptr++) Line 3146  for (;; ptr++)
3146          not any of the other escapes. Perl 5.6 treats a hyphen as a literal          not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3147          in such circumstances. */          in such circumstances. */
3148    
3149          if (d == '\\')          if (!inescq && d == '\\')
3150            {            {
3151            const uschar *oldptr = ptr;            d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3152            d = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);            if (*errorcodeptr != 0) goto FAILED;
3153    
3154            /* \b is backslash; \X is literal X; any other special means the '-'            /* \b is backspace; \X is literal X; \R is literal R; any other
3155            was literal */            special means the '-' was literal */
3156    
3157            if (d < 0)            if (d < 0)
3158              {              {
3159              if (d == -ESC_b) d = '\b';              if (d == -ESC_b) d = '\b';
3160              else if (d == -ESC_X) d = 'X'; else              else if (d == -ESC_X) d = 'X';
3161                else if (d == -ESC_R) d = 'R'; else
3162                {                {
3163                ptr = oldptr - 2;                ptr = oldptr;
3164                goto LONE_SINGLE_CHARACTER;  /* A few lines below */                goto LONE_SINGLE_CHARACTER;  /* A few lines below */
3165                }                }
3166              }              }
3167            }            }
3168    
3169          /* The check that the two values are in the correct order happens in          /* Check that the two values are in the correct order. Optimize
3170          the pre-pass. Optimize one-character ranges */          one-character ranges */
3171    
3172            if (d < c)
3173              {
3174              *errorcodeptr = ERR8;
3175              goto FAILED;
3176              }
3177    
3178          if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */          if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */
3179    
3180            /* Remember \r or \n */
3181    
3182            if (d == '\r' || d == '\n') cd->external_flags |= PCRE_HASCRORLF;
3183    
3184          /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless          /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3185          matching, we have to use an XCLASS with extra data items. Caseless          matching, we have to use an XCLASS with extra data items. Caseless
3186          matching for characters > 127 is available only if UCP support is          matching for characters > 127 is available only if UCP support is
# Line 2067  for (;; ptr++) Line 3198  for (;; ptr++)
3198  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
3199            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
3200              {              {
3201              int occ, ocd;              unsigned int occ, ocd;
3202              int cc = c;              unsigned int cc = c;
3203              int origd = d;              unsigned int origd = d;
3204              while (get_othercase_range(&cc, origd, &occ, &ocd))              while (get_othercase_range(&cc, origd, &occ, &ocd))
3205                {                {
3206                if (occ >= c && ocd <= d) continue;  /* Skip embedded ranges */                if (occ >= (unsigned int)c &&
3207                      ocd <= (unsigned int)d)
3208                    continue;                          /* Skip embedded ranges */
3209    
3210                if (occ < c  && ocd >= c - 1)        /* Extend the basic range */                if (occ < (unsigned int)c  &&
3211                      ocd >= (unsigned int)c - 1)      /* Extend the basic range */
3212                  {                                  /* if there is overlap,   */                  {                                  /* if there is overlap,   */
3213                  c = occ;                           /* noting that if occ < c */                  c = occ;                           /* noting that if occ < c */
3214                  continue;                          /* we can't have ocd > d  */                  continue;                          /* we can't have ocd > d  */
3215                  }                                  /* because a subrange is  */                  }                                  /* because a subrange is  */
3216                if (ocd > d && occ <= d + 1)         /* always shorter than    */                if (ocd > (unsigned int)d &&
3217                      occ <= (unsigned int)d + 1)      /* always shorter than    */
3218                  {                                  /* the basic range.       */                  {                                  /* the basic range.       */
3219                  d = ocd;                  d = ocd;
3220                  continue;                  continue;
# Line 2127  for (;; ptr++) Line 3262  for (;; ptr++)
3262          ranges that lie entirely within 0-127 when there is UCP support; else          ranges that lie entirely within 0-127 when there is UCP support; else
3263          for partial ranges without UCP support. */          for partial ranges without UCP support. */
3264    
3265          for (; c <= d; c++)          class_charcount += d - c + 1;
3266            class_lastchar = d;
3267    
3268            /* We can save a bit of time by skipping this in the pre-compile. */
3269    
3270            if (lengthptr == NULL) for (; c <= d; c++)
3271            {            {
3272            classbits[c/8] |= (1 << (c&7));            classbits[c/8] |= (1 << (c&7));
3273            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
# Line 2135  for (;; ptr++) Line 3275  for (;; ptr++)
3275              int uc = cd->fcc[c];           /* flip case */              int uc = cd->fcc[c];           /* flip case */
3276              classbits[uc/8] |= (1 << (uc&7));              classbits[uc/8] |= (1 << (uc&7));
3277              }              }
           class_charcount++;                /* in case a one-char range */  
           class_lastchar = c;  
3278            }            }
3279    
3280          continue;   /* Go get the next char in the class */          continue;   /* Go get the next char in the class */
# Line 2160  for (;; ptr++) Line 3298  for (;; ptr++)
3298  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
3299          if ((options & PCRE_CASELESS) != 0)          if ((options & PCRE_CASELESS) != 0)
3300            {            {
3301            int othercase;            unsigned int othercase;
3302            if ((othercase = _pcre_ucp_othercase(c)) >= 0)            if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
3303              {              {
3304              *class_utf8data++ = XCL_SINGLE;              *class_utf8data++ = XCL_SINGLE;
3305              class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);              class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
# Line 2186  for (;; ptr++) Line 3324  for (;; ptr++)
3324          }          }
3325        }        }
3326    
3327      /* Loop until ']' reached; the check for end of string happens inside the      /* Loop until ']' reached. This "while" is the end of the "do" above. */
3328      loop. This "while" is the end of the "do" above. */  
3329        while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
3330    
3331        if (c == 0)                          /* Missing terminating ']' */
3332          {
3333          *errorcodeptr = ERR6;
3334          goto FAILED;
3335          }
3336    
3337    
3338    /* This code has been disabled because it would mean that \s counts as
3339    an explicit \r or \n reference, and that's not really what is wanted. Now
3340    we set the flag only if there is a literal "\r" or "\n" in the class. */
3341    
3342    #if 0
3343        /* Remember whether \r or \n are in this class */
3344    
3345        if (negate_class)
3346          {
3347          if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF;
3348          }
3349        else
3350          {
3351          if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF;
3352          }
3353    #endif
3354    
     while ((c = *(++ptr)) != ']' || inescq);  
3355    
3356      /* If class_charcount is 1, we saw precisely one character whose value is      /* If class_charcount is 1, we saw precisely one character whose value is
3357      less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we      less than 256. As long as there were no characters >= 128 and there was no
3358      can optimize the negative case only if there were no characters >= 128      use of \p or \P, in other words, no use of any XCLASS features, we can
3359      because OP_NOT and the related opcodes like OP_NOTSTAR operate on      optimize.
3360      single-bytes only. This is an historical hangover. Maybe one day we can  
3361      tidy these opcodes to handle multi-byte characters.      In UTF-8 mode, we can optimize the negative case only if there were no
3362        characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
3363        operate on single-bytes only. This is an historical hangover. Maybe one day
3364        we can tidy these opcodes to handle multi-byte characters.
3365    
3366      The optimization throws away the bit map. We turn the item into a      The optimization throws away the bit map. We turn the item into a
3367      1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note      1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
# Line 2206  for (;; ptr++) Line 3371  for (;; ptr++)
3371      reqbyte, save the previous value for reinstating. */      reqbyte, save the previous value for reinstating. */
3372    
3373  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
3374      if (class_charcount == 1 &&      if (class_charcount == 1 && !class_utf8 &&
3375            (!utf8 ||        (!utf8 || !negate_class || class_lastchar < 128))
           (!class_utf8 && (!negate_class || class_lastchar < 128))))  
   
3376  #else  #else
3377      if (class_charcount == 1)      if (class_charcount == 1)
3378  #endif  #endif
# Line 2252  for (;; ptr++) Line 3415  for (;; ptr++)
3415      zeroreqbyte = reqbyte;      zeroreqbyte = reqbyte;
3416    
3417      /* If there are characters with values > 255, we have to compile an      /* If there are characters with values > 255, we have to compile an
3418      extended class, with its own opcode. If there are no characters < 256,      extended class, with its own opcode, unless there was a negated special
3419      we can omit the bitmap. */      such as \S in the class, because in that case all characters > 255 are in
3420        the class, so any that were explicitly given as well can be ignored. If
3421        (when there are explicit characters > 255 that must be listed) there are no
3422        characters < 256, we can omit the bitmap in the actual compiled code. */
3423    
3424  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
3425      if (class_utf8)      if (class_utf8 && !should_flip_negation)
3426        {        {
3427        *class_utf8data++ = XCL_END;    /* Marks the end of extra data */        *class_utf8data++ = XCL_END;    /* Marks the end of extra data */
3428        *code++ = OP_XCLASS;        *code++ = OP_XCLASS;
3429        code += LINK_SIZE;        code += LINK_SIZE;
3430        *code = negate_class? XCL_NOT : 0;        *code = negate_class? XCL_NOT : 0;
3431    
3432        /* If the map is required, install it, and move on to the end of        /* If the map is required, move up the extra data to make room for it;
3433        the extra data */        otherwise just move the code pointer to the end of the extra data. */
3434    
3435        if (class_charcount > 0)        if (class_charcount > 0)
3436          {          {
3437          *code++ |= XCL_MAP;          *code++ |= XCL_MAP;
3438            memmove(code + 32, code, class_utf8data - code);
3439          memcpy(code, classbits, 32);          memcpy(code, classbits, 32);
3440          code = class_utf8data;          code = class_utf8data + 32;
         }  
   
       /* If the map is not required, slide down the extra data. */  
   
       else  
         {  
         int len = class_utf8data - (code + 33);  
         memmove(code + 1, code + 33, len);  
         code += len + 1;  
3441          }          }
3442          else code = class_utf8data;
3443    
3444        /* Now fill in the complete length of the item */        /* Now fill in the complete length of the item */
3445    
# Line 2289  for (;; ptr++) Line 3448  for (;; ptr++)
3448        }        }
3449  #endif  #endif
3450    
3451      /* If there are no characters > 255, negate the 32-byte map if necessary,      /* If there are no characters > 255, set the opcode to OP_CLASS or
3452      and copy it into the code vector. If this is the first thing in the branch,      OP_NCLASS, depending on whether the whole class was negated and whether
3453      there can be no first char setting, whatever the repeat count. Any reqbyte      there were negative specials such as \S in the class. Then copy the 32-byte
3454      setting must remain unchanged after any kind of repeat. */      map into the code vector, negating it if necessary. */
3455    
3456        *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
3457      if (negate_class)      if (negate_class)
3458        {        {
3459        *code++ = OP_NCLASS;        if (lengthptr == NULL)    /* Save time in the pre-compile phase */
3460        for (c = 0; c < 32; c++) code[c] = ~classbits[c];          for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3461        }        }
3462      else      else
3463        {        {
       *code++ = OP_CLASS;  
3464        memcpy(code, classbits, 32);        memcpy(code, classbits, 32);
3465        }        }
3466      code += 32;      code += 32;
3467      break;      break;
3468    
3469    
3470        /* ===================================================================*/
3471      /* Various kinds of repeat; '{' is not necessarily a quantifier, but this      /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3472      has been tested above. */      has been tested above. */
3473    
# Line 2374  for (;; ptr++) Line 3535  for (;; ptr++)
3535        }        }
3536      else repeat_type = greedy_default;      else repeat_type = greedy_default;
3537    
     /* If previous was a recursion, we need to wrap it inside brackets so that  
     it can be replicated if necessary. */  
   
     if (*previous == OP_RECURSE)  
       {  
       memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);  
       code += 1 + LINK_SIZE;  
       *previous = OP_BRA;  
       PUT(previous, 1, code - previous);  
       *code = OP_KET;  
       PUT(code, 1, code - previous);  
       code += 1 + LINK_SIZE;  
       }  
   
3538      /* If previous was a character match, abolish the item and generate a      /* If previous was a character match, abolish the item and generate a
3539      repeat item instead. If a char item has a minimum of more than one, ensure      repeat item instead. If a char item has a minimum of more than one, ensure
3540      that it is set in reqbyte - it might not be if a sequence such as x{3} is      that it is set in reqbyte - it might not be if a sequence such as x{3} is
# Line 2421  for (;; ptr++) Line 3568  for (;; ptr++)
3568          if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;          if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3569          }          }
3570    
3571          /* If the repetition is unlimited, it pays to see if the next thing on
3572          the line is something that cannot possibly match this character. If so,
3573          automatically possessifying this item gains some performance in the case
3574          where the match fails. */
3575    
3576          if (!possessive_quantifier &&
3577              repeat_max < 0 &&
3578              check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3579                options, cd))
3580            {
3581            repeat_type = 0;    /* Force greedy */
3582            possessive_quantifier = TRUE;
3583            }
3584    
3585        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
3586        }        }
3587    
3588      /* If previous was a single negated character ([^a] or similar), we use      /* If previous was a single negated character ([^a] or similar), we use
3589      one of the special opcodes, replacing it. The code is shared with single-      one of the special opcodes, replacing it. The code is shared with single-
3590      character repeats by setting opt_type to add a suitable offset into      character repeats by setting opt_type to add a suitable offset into
3591      repeat_type. OP_NOT is currently used only for single-byte chars. */      repeat_type. We can also test for auto-possessification. OP_NOT is
3592        currently used only for single-byte chars. */
3593    
3594      else if (*previous == OP_NOT)      else if (*previous == OP_NOT)
3595        {        {
3596        op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */        op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */
3597        c = previous[1];        c = previous[1];
3598          if (!possessive_quantifier &&
3599              repeat_max < 0 &&
3600              check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3601            {
3602            repeat_type = 0;    /* Force greedy */
3603            possessive_quantifier = TRUE;
3604            }
3605        goto OUTPUT_SINGLE_REPEAT;        goto OUTPUT_SINGLE_REPEAT;
3606        }        }
3607    
# Line 2450  for (;; ptr++) Line 3619  for (;; ptr++)
3619        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
3620        c = *previous;        c = *previous;
3621    
3622          if (!possessive_quantifier &&
3623              repeat_max < 0 &&
3624              check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3625            {
3626            repeat_type = 0;    /* Force greedy */
3627            possessive_quantifier = TRUE;
3628            }
3629    
3630        OUTPUT_SINGLE_REPEAT:        OUTPUT_SINGLE_REPEAT:
3631        if (*previous == OP_PROP || *previous == OP_NOTPROP)        if (*previous == OP_PROP || *previous == OP_NOTPROP)
3632          {          {
# Line 2469  for (;; ptr++) Line 3646  for (;; ptr++)
3646        /* All real repeats make it impossible to handle partial matching (maybe        /* All real repeats make it impossible to handle partial matching (maybe
3647        one day we will be able to remove this restriction). */        one day we will be able to remove this restriction). */
3648    
3649        if (repeat_max != 1) cd->nopartial = TRUE;        if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3650    
3651        /* Combine the op_type with the repeat_type */        /* Combine the op_type with the repeat_type */
3652    
# Line 2490  for (;; ptr++) Line 3667  for (;; ptr++)
3667          }          }
3668    
3669        /* A repeat minimum of 1 is optimized into some special cases. If the        /* A repeat minimum of 1 is optimized into some special cases. If the
3670        maximum is unlimited, we use OP_PLUS. Otherwise, the original item it        maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3671        left in place and, if the maximum is greater than 1, we use OP_UPTO with        left in place and, if the maximum is greater than 1, we use OP_UPTO with
3672        one less than the maximum. */        one less than the maximum. */
3673    
# Line 2543  for (;; ptr++) Line 3720  for (;; ptr++)
3720            }            }
3721    
3722          /* Else insert an UPTO if the max is greater than the min, again          /* Else insert an UPTO if the max is greater than the min, again
3723          preceded by the character, for the previously inserted code. */          preceded by the character, for the previously inserted code. If the
3724            UPTO is just for 1 instance, we can use QUERY instead. */
3725    
3726          else if (repeat_max != repeat_min)          else if (repeat_max != repeat_min)
3727            {            {
# Line 2562  for (;; ptr++) Line 3740  for (;; ptr++)
3740              *code++ = prop_value;              *code++ = prop_value;
3741              }              }
3742            repeat_max -= repeat_min;            repeat_max -= repeat_min;
3743            *code++ = OP_UPTO + repeat_type;  
3744            PUT2INC(code, 0, repeat_max);            if (repeat_max == 1)
3745                {
3746                *code++ = OP_QUERY + repeat_type;
3747                }
3748              else
3749                {
3750                *code++ = OP_UPTO + repeat_type;
3751                PUT2INC(code, 0, repeat_max);
3752                }
3753            }            }
3754          }          }
3755    
# Line 2610  for (;; ptr++) Line 3796  for (;; ptr++)
3796        /* All real repeats make it impossible to handle partial matching (maybe        /* All real repeats make it impossible to handle partial matching (maybe
3797        one day we will be able to remove this restriction). */        one day we will be able to remove this restriction). */
3798    
3799        if (repeat_max != 1) cd->nopartial = TRUE;        if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3800    
3801        if (repeat_min == 0 && repeat_max == -1)        if (repeat_min == 0 && repeat_max == -1)
3802          *code++ = OP_CRSTAR + repeat_type;          *code++ = OP_CRSTAR + repeat_type;
# Line 2630  for (;; ptr++) Line 3816  for (;; ptr++)
3816      /* If previous was a bracket group, we may have to replicate it in certain      /* If previous was a bracket group, we may have to replicate it in certain
3817      cases. */      cases. */
3818    
3819      else if (*previous >= OP_BRA || *previous == OP_ONCE ||      else if (*previous == OP_BRA  || *previous == OP_CBRA ||
3820               *previous == OP_COND)               *previous == OP_ONCE || *previous == OP_COND)
3821        {        {
3822        register int i;        register int i;
3823        int ketoffset = 0;        int ketoffset = 0;
3824        int len = code - previous;        int len = code - previous;
3825        uschar *bralink = NULL;        uschar *bralink = NULL;
3826    
3827          /* Repeating a DEFINE group is pointless */
3828    
3829          if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3830            {
3831            *errorcodeptr = ERR55;
3832            goto FAILED;
3833            }
3834    
3835        /* If the maximum repeat count is unlimited, find the end of the bracket        /* If the maximum repeat count is unlimited, find the end of the bracket
3836        by scanning through from the start, and compute the offset back to it        by scanning through from the start, and compute the offset back to it
3837        from the current code pointer. There may be an OP_OPT setting following        from the current code pointer. There may be an OP_OPT setting following
# Line 2660  for (;; ptr++) Line 3854  for (;; ptr++)
3854    
3855        if (repeat_min == 0)        if (repeat_min == 0)
3856          {          {
3857          /* If the maximum is also zero, we just omit the group from the output          /* If the maximum is also zero, we used to just omit the group from the
3858          altogether. */          output altogether, like this:
   
         if (repeat_max == 0)  
           {  
           code = previous;  
           goto END_REPEAT;  
           }  
3859    
3860          /* If the maximum is 1 or unlimited, we just have to stick in the          ** if (repeat_max == 0)
3861          BRAZERO and do no more at this point. However, we do need to adjust          **   {
3862          any OP_RECURSE calls inside the group that refer to the group itself or          **   code = previous;
3863          any internal group, because the offset is from the start of the whole          **   goto END_REPEAT;
3864          regex. Temporarily terminate the pattern while doing this. */          **   }
3865    
3866            However, that fails when a group is referenced as a subroutine from
3867            elsewhere in the pattern, so now we stick in OP_SKIPZERO in front of it
3868            so that it is skipped on execution. As we don't have a list of which
3869            groups are referenced, we cannot do this selectively.
3870    
3871            If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
3872            and do no more at this point. However, we do need to adjust any
3873            OP_RECURSE calls inside the group that refer to the group itself or any
3874            internal or forward referenced group, because the offset is from the
3875            start of the whole regex. Temporarily terminate the pattern while doing
3876            this. */
3877    
3878          if (repeat_max <= 1)          if (repeat_max <= 1)    /* Covers 0, 1, and unlimited */
3879            {            {
3880            *code = OP_END;            *code = OP_END;
3881            adjust_recurse(previous, 1, utf8, cd);            adjust_recurse(previous, 1, utf8, cd, save_hwm);
3882            memmove(previous+1, previous, len);            memmove(previous+1, previous, len);
3883            code++;            code++;
3884              if (repeat_max == 0)
3885                {
3886                *previous++ = OP_SKIPZERO;
3887                goto END_REPEAT;
3888                }
3889            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
3890            }            }
3891    
# Line 2696  for (;; ptr++) Line 3901  for (;; ptr++)
3901            {            {
3902            int offset;            int offset;
3903            *code = OP_END;            *code = OP_END;
3904            adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd);            adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3905            memmove(previous + 2 + LINK_SIZE, previous, len);            memmove(previous + 2 + LINK_SIZE, previous, len);
3906            code += 2 + LINK_SIZE;            code += 2 + LINK_SIZE;
3907            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
# Line 2716  for (;; ptr++) Line 3921  for (;; ptr++)
3921        /* If the minimum is greater than zero, replicate the group as many        /* If the minimum is greater than zero, replicate the group as many
3922        times as necessary, and adjust the maximum to the number of subsequent        times as necessary, and adjust the maximum to the number of subsequent
3923        copies that we need. If we set a first char from the group, and didn't        copies that we need. If we set a first char from the group, and didn't
3924        set a required char, copy the latter from the former. */        set a required char, copy the latter from the former. If there are any
3925          forward reference subroutine calls in the group, there will be entries on
3926          the workspace list; replicate these with an appropriate increment. */
3927    
3928        else        else
3929          {          {
3930          if (repeat_min > 1)          if (repeat_min > 1)
3931            {            {
3932            if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;            /* In the pre-compile phase, we don't actually do the replication. We
3933            for (i = 1; i < repeat_min; i++)            just adjust the length as if we had. Do some paranoid checks for
3934              potential integer overflow. */
3935    
3936              if (lengthptr != NULL)
3937                {
3938                int delta = (repeat_min - 1)*length_prevgroup;
3939                if ((double)(repeat_min - 1)*(double)length_prevgroup >
3940                                                                (double)INT_MAX ||
3941                    OFLOW_MAX - *lengthptr < delta)
3942                  {
3943                  *errorcodeptr = ERR20;
3944                  goto FAILED;
3945                  }
3946                *lengthptr += delta;
3947                }
3948    
3949              /* This is compiling for real */
3950    
3951              else
3952              {              {
3953              memcpy(code, previous, len);              if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3954              code += len;              for (i = 1; i < repeat_min; i++)
3955                  {
3956                  uschar *hc;
3957                  uschar *this_hwm = cd->hwm;
3958                  memcpy(code, previous, len);
3959                  for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3960                    {
3961                    PUT(cd->hwm, 0, GET(hc, 0) + len);
3962                    cd->hwm += LINK_SIZE;
3963                    }
3964                  save_hwm = this_hwm;
3965                  code += len;
3966                  }
3967              }              }
3968            }            }
3969    
3970          if (repeat_max > 0) repeat_max -= repeat_min;          if (repeat_max > 0) repeat_max -= repeat_min;
3971          }          }
3972    
# Line 2736  for (;; ptr++) Line 3974  for (;; ptr++)
3974        the maximum is limited, it replicates the group in a nested fashion,        the maximum is limited, it replicates the group in a nested fashion,
3975        remembering the bracket starts on a stack. In the case of a zero minimum,        remembering the bracket starts on a stack. In the case of a zero minimum,
3976        the first one was set up above. In all cases the repeat_max now specifies        the first one was set up above. In all cases the repeat_max now specifies
3977        the number of additional copies needed. */        the number of additional copies needed. Again, we must remember to
3978          replicate entries on the forward reference list. */
3979    
3980        if (repeat_max >= 0)        if (repeat_max >= 0)
3981          {          {
3982          for (i = repeat_max - 1; i >= 0; i--)          /* In the pre-compile phase, we don't actually do the replication. We
3983            just adjust the length as if we had. For each repetition we must add 1
3984            to the length for BRAZERO and for all but the last repetition we must
3985            add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
3986            paranoid checks to avoid integer overflow. */
3987    
3988            if (lengthptr != NULL && repeat_max > 0)
3989              {
3990              int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3991                          2 - 2*LINK_SIZE;   /* Last one doesn't nest */
3992              if ((double)repeat_max *
3993                    (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
3994                      > (double)INT_MAX ||
3995                  OFLOW_MAX - *lengthptr < delta)
3996                {
3997                *errorcodeptr = ERR20;
3998                goto FAILED;
3999                }
4000              *lengthptr += delta;
4001              }
4002    
4003            /* This is compiling for real */
4004    
4005            else for (i = repeat_max - 1; i >= 0; i--)
4006            {            {
4007              uschar *hc;
4008              uschar *this_hwm = cd->hwm;
4009    
4010            *code++ = OP_BRAZERO + repeat_type;            *code++ = OP_BRAZERO + repeat_type;
4011    
4012            /* All but the final copy start a new nesting, maintaining the            /* All but the final copy start a new nesting, maintaining the
# Line 2757  for (;; ptr++) Line 4022  for (;; ptr++)
4022              }              }
4023    
4024            memcpy(code, previous, len);            memcpy(code, previous, len);
4025              for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
4026                {
4027                PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
4028                cd->hwm += LINK_SIZE;
4029                }
4030              save_hwm = this_hwm;
4031            code += len;            code += len;
4032            }            }
4033    
# Line 2779  for (;; ptr++) Line 4050  for (;; ptr++)
4050        /* If the maximum is unlimited, set a repeater in the final copy. We        /* If the maximum is unlimited, set a repeater in the final copy. We
4051        can't just offset backwards from the current code point, because we        can't just offset backwards from the current code point, because we
4052        don't know if there's been an options resetting after the ket. The        don't know if there's been an options resetting after the ket. The
4053        correct offset was computed above. */        correct offset was computed above.
4054    
4055        else code[-ketoffset] = OP_KETRMAX + repeat_type;        Then, when we are doing the actual compile phase, check to see whether
4056          this group is a non-atomic one that could match an empty string. If so,
4057          convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
4058          that runtime checking can be done. [This check is also applied to
4059          atomic groups at runtime, but in a different way.] */
4060    
4061          else
4062            {
4063            uschar *ketcode = code - ketoffset;
4064            uschar *bracode = ketcode - GET(ketcode, 1);
4065            *ketcode = OP_KETRMAX + repeat_type;
4066            if (lengthptr == NULL && *bracode != OP_ONCE)
4067              {
4068              uschar *scode = bracode;
4069              do
4070                {
4071                if (could_be_empty_branch(scode, ketcode, utf8))
4072                  {
4073                  *bracode += OP_SBRA - OP_BRA;
4074                  break;
4075                  }
4076                scode += GET(scode, 1);
4077                }
4078              while (*scode == OP_ALT);
4079              }
4080            }
4081        }        }
4082    
4083      /* Else there's some kind of shambles */      /* Else there's some kind of shambles */
# Line 2792  for (;; ptr++) Line 4088  for (;; ptr++)
4088        goto FAILED;        goto FAILED;
4089        }        }
4090    
4091      /* If the character following a repeat is '+', we wrap the entire repeated      /* If the character following a repeat is '+', or if certain optimization
4092      item inside OP_ONCE brackets. This is just syntactic sugar, taken from      tests above succeeded, possessive_quantifier is TRUE. For some of the
4093      Sun's Java package. The repeated item starts at tempcode, not at previous,      simpler opcodes, there is an special alternative opcode for this. For
4094      which might be the first part of a string whose (former) last char we      anything else, we wrap the entire repeated item inside OP_ONCE brackets.
4095      repeated. However, we don't support '+' after a greediness '?'. */      The '+' notation is just syntactic sugar, taken from Sun's Java package,
4096        but the special opcodes can optimize it a bit. The repeated item starts at
4097        tempcode, not at previous, which might be the first part of a string whose
4098        (former) last char we repeated.
4099    
4100        Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
4101        an 'upto' may follow. We skip over an 'exact' item, and then test the
4102        length of what remains before proceeding. */
4103    
4104      if (possessive_quantifier)      if (possessive_quantifier)
4105        {        {
4106        int len = code - tempcode;        int len;
4107        memmove(tempcode + 1+LINK_SIZE, tempcode, len);        if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
4108        code += 1 + LINK_SIZE;            *tempcode == OP_NOTEXACT)
4109        len += 1 + LINK_SIZE;          tempcode += _pcre_OP_lengths[*tempcode] +
4110        tempcode[0] = OP_ONCE;            ((*tempcode == OP_TYPEEXACT &&
4111        *code++ = OP_KET;               (tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP))? 2:0);
4112        PUTINC(code, 0, len);        len = code - tempcode;
4113        PUT(tempcode, 1, len);        if (len > 0) switch (*tempcode)
4114            {
4115            case OP_STAR:  *tempcode = OP_POSSTAR; break;
4116            case OP_PLUS:  *tempcode = OP_POSPLUS; break;
4117            case OP_QUERY: *tempcode = OP_POSQUERY; break;
4118            case OP_UPTO:  *tempcode = OP_POSUPTO; break;
4119    
4120            case OP_TYPESTAR:  *tempcode = OP_TYPEPOSSTAR; break;
4121            case OP_TYPEPLUS:  *tempcode = OP_TYPEPOSPLUS; break;
4122            case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
4123            case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;
4124    
4125            case OP_NOTSTAR:  *tempcode = OP_NOTPOSSTAR; break;
4126            case OP_NOTPLUS:  *tempcode = OP_NOTPOSPLUS; break;
4127            case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
4128            case OP_NOTUPTO:  *tempcode = OP_NOTPOSUPTO; break;
4129    
4130            default:
4131            memmove(tempcode + 1+LINK_SIZE, tempcode, len);
4132            code += 1 + LINK_SIZE;
4133            len += 1 + LINK_SIZE;
4134            tempcode[0] = OP_ONCE;
4135            *code++ = OP_KET;
4136            PUTINC(code, 0, len);
4137            PUT(tempcode, 1, len);
4138            break;
4139            }
4140        }        }
4141    
4142      /* In all cases we no longer have a previous item. We also set the      /* In all cases we no longer have a previous item. We also set the
# Line 2820  for (;; ptr++) Line 4149  for (;; ptr++)
4149      break;      break;
4150    
4151    
4152      /* Start of nested bracket sub-expression, or comment or lookahead or      /* ===================================================================*/
4153      lookbehind or option setting or condition. First deal with special things      /* Start of nested parenthesized sub-expression, or comment or lookahead or
4154      that can come after a bracket; all are introduced by ?, and the appearance      lookbehind or option setting or condition or all the other extended
4155      of any of them means that this is not a referencing group. They were      parenthesis forms.  */
     checked for validity in the first pass over the string, so we don't have to  
     check for syntax errors here.  */  
4156    
4157      case '(':      case '(':
4158      newoptions = options;      newoptions = options;
4159      skipbytes = 0;      skipbytes = 0;
4160        bravalue = OP_CBRA;
4161        save_hwm = cd->hwm;
4162        reset_bracount = FALSE;
4163    
4164        /* First deal with various "verbs" that can be introduced by '*'. */
4165    
4166        if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
4167          {
4168          int i, namelen;
4169          const char *vn = verbnames;
4170          const uschar *name = ++ptr;
4171          previous = NULL;
4172          while ((cd->ctypes[*++ptr] & ctype_letter) != 0);
4173          if (*ptr == ':')
4174            {
4175            *errorcodeptr = ERR59;   /* Not supported */
4176            goto FAILED;
4177            }
4178          if (*ptr != ')')
4179            {
4180            *errorcodeptr = ERR60;
4181            goto FAILED;
4182            }
4183          namelen = ptr - name;
4184          for (i = 0; i < verbcount; i++)
4185            {
4186            if (namelen == verbs[i].len &&
4187                strncmp((char *)name, vn, namelen) == 0)
4188              {
4189              *code = verbs[i].op;
4190              if (*code++ == OP_ACCEPT) cd->had_accept = TRUE;
4191              break;
4192              }
4193            vn += verbs[i].len + 1;
4194            }
4195          if (i < verbcount) continue;
4196          *errorcodeptr = ERR60;
4197          goto FAILED;
4198          }
4199    
4200        /* Deal with the extended parentheses; all are introduced by '?', and the
4201        appearance of any of them means that this is not a capturing group. */
4202    
4203      if (*(++ptr) == '?')      else if (*ptr == '?')
4204        {        {
4205        int set, unset;        int i, set, unset, namelen;
4206        int *optset;        int *optset;
4207          const uschar *name;
4208          uschar *slot;
4209    
4210        switch (*(++ptr))        switch (*(++ptr))
4211          {          {
4212          case '#':                 /* Comment; skip to ket */          case '#':                 /* Comment; skip to ket */
4213          ptr++;          ptr++;
4214          while (*ptr != ')') ptr++;          while (*ptr != 0 && *ptr != ')') ptr++;
4215            if (*ptr == 0)
4216              {
4217              *errorcodeptr = ERR18;
4218              goto FAILED;
4219              }
4220          continue;          continue;
4221    
4222          case ':':                 /* Non-extracting bracket */  
4223            /* ------------------------------------------------------------ */
4224            case '|':                 /* Reset capture count for each branch */
4225            reset_bracount = TRUE;
4226            /* Fall through */
4227    
4228            /* ------------------------------------------------------------ */
4229            case ':':                 /* Non-capturing bracket */
4230          bravalue = OP_BRA;          bravalue = OP_BRA;
4231          ptr++;          ptr++;
4232          break;          break;
4233    
4234    
4235            /* ------------------------------------------------------------ */
4236          case '(':          case '(':
4237          bravalue = OP_COND;       /* Conditional group */          bravalue = OP_COND;       /* Conditional group */
4238    
4239          /* Condition to test for recursion */          /* A condition can be an assertion, a number (referring to a numbered
4240            group), a name (referring to a named group), or 'R', referring to
4241            recursion. R<digits> and R&name are also permitted for recursion tests.
4242    
4243            There are several syntaxes for testing a named group: (?(name)) is used
4244            by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
4245    
4246            There are two unfortunate ambiguities, caused by history. (a) 'R' can
4247            be the recursive thing or the name 'R' (and similarly for 'R' followed
4248            by digits), and (b) a number could be a name that consists of digits.
4249            In both cases, we look for a name first; if not found, we try the other
4250            cases. */
4251    
4252            /* For conditions that are assertions, check the syntax, and then exit
4253            the switch. This will take control down to where bracketed groups,
4254            including assertions, are processed. */
4255    
4256            if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
4257              break;
4258    
4259            /* Most other conditions use OP_CREF (a couple change to OP_RREF
4260            below), and all need to skip 3 bytes at the start of the group. */
4261    
4262            code[1+LINK_SIZE] = OP_CREF;
4263            skipbytes = 3;
4264            refsign = -1;
4265    
4266            /* Check for a test for recursion in a named group. */
4267    
4268          if (ptr[1] == 'R')          if (ptr[1] == 'R' && ptr[2] == '&')
4269            {            {
4270            code[1+LINK_SIZE] = OP_CREF;            terminator = -1;
4271            PUT2(code, 2+LINK_SIZE, CREF_RECURSE);            ptr += 2;
4272            skipbytes = 3;            code[1+LINK_SIZE] = OP_RREF;    /* Change the type of test */
           ptr += 3;  
4273            }            }
4274    
4275          /* Condition to test for a numbered subpattern match. We know that          /* Check for a test for a named group's having been set, using the Perl
4276          if a digit follows ( then there will just be digits until ) because          syntax (?(<name>) or (?('name') */
         the syntax was checked in the first pass. */  
4277    
4278          else if ((digitab[ptr[1]] && ctype_digit) != 0)          else if (ptr[1] == '<')
4279            {            {
4280            int condref;                 /* Don't amalgamate; some compilers */            terminator = '>';
           condref = *(++ptr) - '0';    /* grumble at autoincrement in declaration */  
           while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';  
           if (condref == 0)  
             {  
             *errorcodeptr = ERR35;  
             goto FAILED;  
             }  
4281            ptr++;            ptr++;
           code[1+LINK_SIZE] = OP_CREF;  
           PUT2(code, 2+LINK_SIZE, condref);  
           skipbytes = 3;  
4282            }            }
4283          /* For conditions that are assertions, we just fall through, having          else if (ptr[1] == '\'')
4284          set bravalue above. */            {
4285          break;            terminator = '\'';
4286              ptr++;
4287          case '=':                 /* Positive lookahead */            }
4288          bravalue = OP_ASSERT;          else
4289          ptr++;            {
4290          break;            terminator = 0;
4291              if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
4292              }
4293    
4294          case '!':                 /* Negative lookahead */          /* We now expect to read a name; any thing else is an error */
         bravalue = OP_ASSERT_NOT;  
         ptr++;  
         break;  
4295    
4296          case '<':                 /* Lookbehinds */          if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
         switch (*(++ptr))  
4297            {            {
4298            case '=':               /* Positive lookbehind */            ptr += 1;  /* To get the right offset */
4299            bravalue = OP_ASSERTBACK;            *errorcodeptr = ERR28;
4300            ptr++;            goto FAILED;
4301            break;            }
4302    
4303            case '!':               /* Negative lookbehind */          /* Read the name, but also get it as a number if it's all digits */
4304            bravalue = OP_ASSERTBACK_NOT;  
4305            recno = 0;
4306            name = ++ptr;
4307            while ((cd->ctypes[*ptr] & ctype_word) != 0)
4308              {
4309              if (recno >= 0)
4310                recno = ((digitab[*ptr] & ctype_digit) != 0)?
4311                  recno * 10 + *ptr - '0' : -1;
4312            ptr++;            ptr++;
           break;  
4313            }            }
4314          break;          namelen = ptr - name;
4315    
4316          case '>':                 /* One-time brackets */          if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
4317          bravalue = OP_ONCE;            {
4318          ptr++;            ptr--;      /* Error offset */
4319          break;            *errorcodeptr = ERR26;
4320              goto FAILED;
4321              }
4322    
4323          case 'C':                 /* Callout - may be followed by digits; */          /* Do no further checking in the pre-compile phase. */
4324          previous_callout = code;  /* Save for later completion */  
4325          after_manual_callout = 1; /* Skip one item before completing */          if (lengthptr != NULL) break;
4326          *code++ = OP_CALLOUT;     /* Already checked that the terminating */  
4327            {                       /* closing parenthesis is present. */          /* In the real compile we do the work of looking for the actual
4328            int n = 0;          reference. If the string started with "+" or "-" we require the rest to
4329            while ((digitab[*(++ptr)] & ctype_digit) != 0)          be digits, in which case recno will be set. */
4330              n = n * 10 + *ptr - '0';  
4331            if (n > 255)          if (refsign > 0)
4332              {
4333              if (recno <= 0)
4334              {              {
4335              *errorcodeptr = ERR38;              *errorcodeptr = ERR58;
4336              goto FAILED;              goto FAILED;
4337              }              }
4338            *code++ = n;            recno = (refsign == '-')?
4339            PUT(code, 0, ptr - cd->start_pattern + 1);  /* Pattern offset */              cd->bracount - recno + 1 : recno +cd->bracount;
4340            PUT(code, LINK_SIZE, 0);                    /* Default length */            if (recno <= 0 || recno > cd->final_bracount)
4341            code += 2 * LINK_SIZE;              {
4342                *errorcodeptr = ERR15;
4343                goto FAILED;
4344                }
4345              PUT2(code, 2+LINK_SIZE, recno);
4346              break;
4347            }            }
         previous = NULL;  
         continue;  
4348    
4349          case 'P':                 /* Named subpattern handling */          /* Otherwise (did not start with "+" or "-"), start by looking for the
4350          if (*(++ptr) == '<')      /* Definition */          name. */
4351    
4352            slot = cd->name_table;
4353            for (i = 0; i < cd->names_found; i++)
4354            {            {
4355            int i, namelen;            if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4356            uschar *slot = cd->name_table;            slot += cd->name_entry_size;
4357            const uschar *name;     /* Don't amalgamate; some compilers */            }
           name = ++ptr;           /* grumble at autoincrement in declaration */  
4358    
4359            while (*ptr++ != '>');          /* Found a previous named subpattern */
           namelen = ptr - name - 1;  
4360    
4361            for (i = 0; i < cd->names_found; i++)          if (i < cd->names_found)
4362              {
4363              recno = GET2(slot, 0);
4364              PUT2(code, 2+LINK_SIZE, recno);
4365              }
4366    
4367            /* Search the pattern for a forward reference */
4368    
4369            else if ((i = find_parens(ptr, cd->bracount, name, namelen,
4370                            (options & PCRE_EXTENDED) != 0)) > 0)
4371              {
4372              PUT2(code, 2+LINK_SIZE, i);
4373              }
4374    
4375            /* If terminator == 0 it means that the name followed directly after
4376            the opening parenthesis [e.g. (?(abc)...] and in this case there are
4377            some further alternatives to try. For the cases where terminator != 0
4378            [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
4379            now checked all the possibilities, so give an error. */
4380    
4381            else if (terminator != 0)
4382              {
4383              *errorcodeptr = ERR15;
4384              goto FAILED;
4385              }
4386    
4387            /* Check for (?(R) for recursion. Allow digits after R to specify a
4388            specific group number. */
4389    
4390            else if (*name == 'R')
4391              {
4392              recno = 0;
4393              for (i = 1; i < namelen; i++)
4394                {
4395                if ((digitab[name[i]] & ctype_digit) == 0)
4396                  {
4397                  *errorcodeptr = ERR15;
4398                  goto FAILED;
4399                  }
4400                recno = recno * 10 + name[i] - '0';
4401                }
4402              if (recno == 0) recno = RREF_ANY;
4403              code[1+LINK_SIZE] = OP_RREF;      /* Change test type */
4404              PUT2(code, 2+LINK_SIZE, recno);
4405              }
4406    
4407            /* Similarly, check for the (?(DEFINE) "condition", which is always
4408            false. */
4409    
4410            else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
4411              {
4412              code[1+LINK_SIZE] = OP_DEF;
4413              skipbytes = 1;
4414              }
4415    
4416            /* Check for the "name" actually being a subpattern number. We are
4417            in the second pass here, so final_bracount is set. */
4418    
4419            else if (recno > 0 && recno <= cd->final_bracount)
4420              {
4421              PUT2(code, 2+LINK_SIZE, recno);
4422              }
4423    
4424            /* Either an unidentified subpattern, or a reference to (?(0) */
4425    
4426            else
4427              {
4428              *errorcodeptr = (recno == 0)? ERR35: ERR15;
4429              goto FAILED;
4430              }
4431            break;
4432    
4433    
4434            /* ------------------------------------------------------------ */
4435            case '=':                 /* Positive lookahead */
4436            bravalue = OP_ASSERT;
4437            ptr++;
4438            break;
4439    
4440    
4441            /* ------------------------------------------------------------ */
4442            case '!':                 /* Negative lookahead */
4443            ptr++;
4444            if (*ptr == ')')          /* Optimize (?!) */
4445              {
4446              *code++ = OP_FAIL;
4447              previous = NULL;
4448              continue;
4449              }
4450            bravalue = OP_ASSERT_NOT;
4451            break;
4452    
4453    
4454            /* ------------------------------------------------------------ */
4455            case '<':                 /* Lookbehind or named define */
4456            switch (ptr[1])
4457              {
4458              case '=':               /* Positive lookbehind */
4459              bravalue = OP_ASSERTBACK;
4460              ptr += 2;
4461              break;
4462    
4463              case '!':               /* Negative lookbehind */
4464              bravalue = OP_ASSERTBACK_NOT;
4465              ptr += 2;
4466              break;
4467    
4468              default:                /* Could be name define, else bad */
4469              if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
4470              ptr++;                  /* Correct offset for error */
4471              *errorcodeptr = ERR24;
4472              goto FAILED;
4473              }
4474            break;
4475    
4476    
4477            /* ------------------------------------------------------------ */
4478            case '>':                 /* One-time brackets */
4479            bravalue = OP_ONCE;
4480            ptr++;
4481            break;
4482    
4483    
4484            /* ------------------------------------------------------------ */
4485            case 'C':                 /* Callout - may be followed by digits; */
4486            previous_callout = code;  /* Save for later completion */
4487            after_manual_callout = 1; /* Skip one item before completing */
4488            *code++ = OP_CALLOUT;
4489              {
4490              int n = 0;
4491              while ((digitab[*(++ptr)] & ctype_digit) != 0)
4492                n = n * 10 + *ptr - '0';
4493              if (*ptr != ')')
4494                {
4495                *errorcodeptr = ERR39;
4496                goto FAILED;
4497                }
4498              if (n > 255)
4499                {
4500                *errorcodeptr = ERR38;
4501                goto FAILED;
4502                }
4503              *code++ = n;
4504              PUT(code, 0, ptr - cd->start_pattern + 1);  /* Pattern offset */
4505              PUT(code, LINK_SIZE, 0);                    /* Default length */
4506              code += 2 * LINK_SIZE;
4507              }
4508            previous = NULL;
4509            continue;
4510    
4511    
4512            /* ------------------------------------------------------------ */
4513            case 'P':                 /* Python-style named subpattern handling */
4514            if (*(++ptr) == '=' || *ptr == '>')  /* Reference or recursion */
4515              {
4516              is_recurse = *ptr == '>';
4517              terminator = ')';
4518              goto NAMED_REF_OR_RECURSE;
4519              }
4520            else if (*ptr != '<')    /* Test for Python-style definition */
4521              {
4522              *errorcodeptr = ERR41;
4523              goto FAILED;
4524              }
4525            /* Fall through to handle (?P< as (?< is handled */
4526    
4527    
4528            /* ------------------------------------------------------------ */
4529            DEFINE_NAME:    /* Come here from (?< handling */
4530            case '\'':
4531              {
4532              terminator = (*ptr == '<')? '>' : '\'';
4533              name = ++ptr;
4534    
4535              while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4536              namelen = ptr - name;
4537    
4538              /* In the pre-compile phase, just do a syntax check. */
4539    
4540              if (lengthptr != NULL)
4541              {              {
4542              int crc = memcmp(name, slot+2, namelen);              if (*ptr != terminator)
4543              if (crc == 0)                {
4544                  *errorcodeptr = ERR42;
4545                  goto FAILED;
4546                  }
4547                if (cd->names_found >= MAX_NAME_COUNT)
4548                  {
4549                  *errorcodeptr = ERR49;
4550                  goto FAILED;
4551                  }
4552                if (namelen + 3 > cd->name_entry_size)
4553                {                {
4554                if (slot[2+namelen] == 0)                cd->name_entry_size = namelen + 3;
4555                  if (namelen > MAX_NAME_SIZE)
4556                  {                  {
4557                  *errorcodeptr = ERR43;                  *errorcodeptr = ERR48;
4558                  goto FAILED;                  goto FAILED;
4559