/[pcre]/code/tags/pcre-7.8/pcre_compile.c
ViewVC logotype

Diff of /code/tags/pcre-7.8/pcre_compile.c

Parent Directory | Revision Log | View Patch

revision 85 by nigel, Sat Feb 24 21:41:13 2007 UTC | revision 215 by ph10, Wed Aug 15 14:20:05 2007 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2005 University of Cambridge             Copyright (c) 1997-2007 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 42  POSSIBILITY OF SUCH DAMAGE. Line 42  POSSIBILITY OF SUCH DAMAGE.
42  supporting internal functions that are not used by other modules. */  supporting internal functions that are not used by other modules. */
43    
44    
45    #ifdef HAVE_CONFIG_H
46    #include <config.h>
47    #endif
48    
49    #define NLBLOCK cd             /* Block containing newline information */
50    #define PSSTART start_pattern  /* Field containing processed string start */
51    #define PSEND   end_pattern    /* Field containing processed string end */
52    
53  #include "pcre_internal.h"  #include "pcre_internal.h"
54    
55    
# Line 53  used by pcretest. DEBUG is not defined w Line 61  used by pcretest. DEBUG is not defined w
61  #endif  #endif
62    
63    
64    /* Macro for setting individual bits in class bitmaps. */
65    
66    #define SETBIT(a,b) a[b/8] |= (1 << (b%8))
67    
68    /* Maximum length value to check against when making sure that the integer that
69    holds the compiled pattern length does not overflow. We make it a bit less than
70    INT_MAX to allow for adding in group terminating bytes, so that we don't have
71    to check them every time. */
72    
73    #define OFLOW_MAX (INT_MAX - 20)
74    
75    
76  /*************************************************  /*************************************************
77  *      Code parameters and static tables         *  *      Code parameters and static tables         *
78  *************************************************/  *************************************************/
79    
80  /* Maximum number of items on the nested bracket stacks at compile time. This  /* This value specifies the size of stack workspace that is used during the
81  applies to the nesting of all kinds of parentheses. It does not limit  first pre-compile phase that determines how much memory is required. The regex
82  un-nested, non-capturing parentheses. This number can be made bigger if  is partly compiled into this space, but the compiled parts are discarded as
83  necessary - it is used to dimension one int and one unsigned char vector at  soon as they can be, so that hopefully there will never be an overrun. The code
84  compile time. */  does, however, check for an overrun. The largest amount I've seen used is 218,
85    so this number is very generous.
86    
87    The same workspace is used during the second, actual compile phase for
88    remembering forward references to groups so that they can be filled in at the
89    end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
90    is 4 there is plenty of room. */
91    
92  #define BRASTACK_SIZE 200  #define COMPILE_WORK_SIZE (4096)
93    
94    
95  /* Table for handling escaped characters in the range '0'-'z'. Positive returns  /* Table for handling escaped characters in the range '0'-'z'. Positive returns
# Line 72  are simple data values; negative values Line 97  are simple data values; negative values
97  on. Zero means further processing is needed (for things like \x), or the escape  on. Zero means further processing is needed (for things like \x), or the escape
98  is invalid. */  is invalid. */
99    
100  #if !EBCDIC   /* This is the "normal" table for ASCII systems */  #ifndef EBCDIC  /* This is the "normal" table for ASCII systems */
101  static const short int escapes[] = {  static const short int escapes[] = {
102       0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */       0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */
103       0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */       0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */
104     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */
105       0,      0,      0,      0,      0,      0,      0,      0,   /* H - O */  -ESC_H,      0,      0, -ESC_K,      0,      0,      0,      0,   /* H - O */
106  -ESC_P, -ESC_Q,      0, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */  -ESC_P, -ESC_Q, -ESC_R, -ESC_S,      0,      0, -ESC_V, -ESC_W,   /* P - W */
107  -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */  -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */
108     '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */     '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */
109       0,      0,      0,      0,      0,      0,  ESC_n,      0,   /* h - o */  -ESC_h,      0,      0, -ESC_k,      0,      0,  ESC_n,      0,   /* h - o */
110  -ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0,      0, -ESC_w,   /* p - w */  -ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0, -ESC_v, -ESC_w,   /* p - w */
111       0,      0, -ESC_z                                            /* x - z */       0,      0, -ESC_z                                            /* x - z */
112  };  };
113    
114  #else         /* This is the "abnormal" table for EBCDIC systems */  #else           /* This is the "abnormal" table for EBCDIC systems */
115  static const short int escapes[] = {  static const short int escapes[] = {
116  /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',  /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
117  /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,  /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
# Line 96  static const short int escapes[] = { Line 121  static const short int escapes[] = {
121  /*  70 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  70 */     0,     0,      0,       0,      0,     0,      0,      0,
122  /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',  /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
123  /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,  /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
124  /*  88 */     0,     0,      0,     '{',      0,     0,      0,      0,  /*  88 */-ESC_h,     0,      0,     '{',      0,     0,      0,      0,
125  /*  90 */     0,     0,      0,     'l',      0, ESC_n,      0, -ESC_p,  /*  90 */     0,     0, -ESC_k,     'l',      0, ESC_n,      0, -ESC_p,
126  /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,  /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
127  /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,     0, -ESC_w,      0,  /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,-ESC_v, -ESC_w,      0,
128  /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,  /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
129  /*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,
130  /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',  /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
131  /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,  /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
132  /*  C8 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,
133  /*  D0 */   '}',     0,      0,       0,      0,     0,      0, -ESC_P,  /*  D0 */   '}',     0, -ESC_K,       0,      0,     0,      0, -ESC_P,
134  /*  D8 */-ESC_Q,     0,      0,       0,      0,     0,      0,      0,  /*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
135  /*  E0 */  '\\',     0, -ESC_S,       0,      0,     0, -ESC_W, -ESC_X,  /*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,
136  /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,  /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
137  /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
138  /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0  /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0
# Line 115  static const short int escapes[] = { Line 140  static const short int escapes[] = {
140  #endif  #endif
141    
142    
143    /* Table of special "verbs" like (*PRUNE) */
144    
145    typedef struct verbitem {
146      const char *name;
147      int   len;
148      int   op;
149    } verbitem;
150    
151    static verbitem verbs[] = {
152      { "ACCEPT", 6, OP_ACCEPT },
153      { "COMMIT", 6, OP_COMMIT },
154      { "F",      1, OP_FAIL },
155      { "FAIL",   4, OP_FAIL },
156      { "PRUNE",  5, OP_PRUNE },
157      { "SKIP",   4, OP_SKIP  },
158      { "THEN",   4, OP_THEN  }
159    };
160    
161    static int verbcount = sizeof(verbs)/sizeof(verbitem);
162    
163    
164  /* Tables of names of POSIX character classes and their lengths. The list is  /* Tables of names of POSIX character classes and their lengths. The list is
165  terminated by a zero length entry. The first three must be alpha, upper, lower,  terminated by a zero length entry. The first three must be alpha, lower, upper,
166  as this is assumed for handling case independence. */  as this is assumed for handling case independence. */
167    
168  static const char *const posix_names[] = {  static const char *const posix_names[] = {
# Line 127  static const char *const posix_names[] = Line 173  static const char *const posix_names[] =
173  static const uschar posix_name_lengths[] = {  static const uschar posix_name_lengths[] = {
174    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
175    
176  /* Table of class bit maps for each POSIX class; up to three may be combined  /* Table of class bit maps for each POSIX class. Each class is formed from a
177  to form the class. The table for [:blank:] is dynamically modified to remove  base map, with an optional addition or removal of another map. Then, for some
178  the vertical space characters. */  classes, there is some additional tweaking: for [:blank:] the vertical space
179    characters are removed, and for [:alpha:] and [:alnum:] the underscore
180    character is removed. The triples in the table consist of the base map offset,
181    second map offset or -1 if no second map, and a non-negative value for map
182    addition or a negative value for map subtraction (if there are two maps). The
183    absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
184    remove vertical space characters, 2 => remove underscore. */
185    
186  static const int posix_class_maps[] = {  static const int posix_class_maps[] = {
187    cbit_lower, cbit_upper, -1,             /* alpha */    cbit_word,  cbit_digit, -2,             /* alpha */
188    cbit_lower, -1,         -1,             /* lower */    cbit_lower, -1,          0,             /* lower */
189    cbit_upper, -1,         -1,             /* upper */    cbit_upper, -1,          0,             /* upper */
190    cbit_digit, cbit_lower, cbit_upper,     /* alnum */    cbit_word,  -1,          2,             /* alnum - word without underscore */
191    cbit_print, cbit_cntrl, -1,             /* ascii */    cbit_print, cbit_cntrl,  0,             /* ascii */
192    cbit_space, -1,         -1,             /* blank - a GNU extension */    cbit_space, -1,          1,             /* blank - a GNU extension */
193    cbit_cntrl, -1,         -1,             /* cntrl */    cbit_cntrl, -1,          0,             /* cntrl */
194    cbit_digit, -1,         -1,             /* digit */    cbit_digit, -1,          0,             /* digit */
195    cbit_graph, -1,         -1,             /* graph */    cbit_graph, -1,          0,             /* graph */
196    cbit_print, -1,         -1,             /* print */    cbit_print, -1,          0,             /* print */
197    cbit_punct, -1,         -1,             /* punct */    cbit_punct, -1,          0,             /* punct */
198    cbit_space, -1,         -1,             /* space */    cbit_space, -1,          0,             /* space */
199    cbit_word,  -1,         -1,             /* word - a Perl extension */    cbit_word,  -1,          0,             /* word - a Perl extension */
200    cbit_xdigit,-1,         -1              /* xdigit */    cbit_xdigit,-1,          0              /* xdigit */
201  };  };
202    
203    
204    #define STRING(a)  # a
205    #define XSTRING(s) STRING(s)
206    
207  /* The texts of compile-time error messages. These are "char *" because they  /* The texts of compile-time error messages. These are "char *" because they
208  are passed to the outside world. */  are passed to the outside world. Do not ever re-use any error number, because
209    they are documented. Always add a new error instead. Messages marked DEAD below
210    are no longer used. */
211    
212  static const char *error_texts[] = {  static const char *error_texts[] = {
213    "no error",    "no error",
# Line 165  static const char *error_texts[] = { Line 222  static const char *error_texts[] = {
222    "range out of order in character class",    "range out of order in character class",
223    "nothing to repeat",    "nothing to repeat",
224    /* 10 */    /* 10 */
225    "operand of unlimited repeat could match the empty string",    "operand of unlimited repeat could match the empty string",  /** DEAD **/
226    "internal error: unexpected repeat",    "internal error: unexpected repeat",
227    "unrecognized character after (?",    "unrecognized character after (?",
228    "POSIX named classes are supported only within a class",    "POSIX named classes are supported only within a class",
# Line 175  static const char *error_texts[] = { Line 232  static const char *error_texts[] = {
232    "erroffset passed as NULL",    "erroffset passed as NULL",
233    "unknown option bit(s) set",    "unknown option bit(s) set",
234    "missing ) after comment",    "missing ) after comment",
235    "parentheses nested too deeply",    "parentheses nested too deeply",  /** DEAD **/
236    /* 20 */    /* 20 */
237    "regular expression too large",    "regular expression is too large",
238    "failed to get memory",    "failed to get memory",
239    "unmatched parentheses",    "unmatched parentheses",
240    "internal error: code overflow",    "internal error: code overflow",
241    "unrecognized character after (?<",    "unrecognized character after (?<",
242    /* 25 */    /* 25 */
243    "lookbehind assertion is not fixed length",    "lookbehind assertion is not fixed length",
244    "malformed number after (?(",    "malformed number or name after (?(",
245    "conditional group contains more than two branches",    "conditional group contains more than two branches",
246    "assertion expected after (?(",    "assertion expected after (?(",
247    "(?R or (?digits must be followed by )",    "(?R or (?[+-]digits must be followed by )",
248    /* 30 */    /* 30 */
249    "unknown POSIX class name",    "unknown POSIX class name",
250    "POSIX collating elements are not supported",    "POSIX collating elements are not supported",
251    "this version of PCRE is not compiled with PCRE_UTF8 support",    "this version of PCRE is not compiled with PCRE_UTF8 support",
252    "spare error",    "spare error",  /** DEAD **/
253    "character value in \\x{...} sequence is too large",    "character value in \\x{...} sequence is too large",
254    /* 35 */    /* 35 */
255    "invalid condition (?(0)",    "invalid condition (?(0)",
# Line 203  static const char *error_texts[] = { Line 260  static const char *error_texts[] = {
260    /* 40 */    /* 40 */
261    "recursive call could loop indefinitely",    "recursive call could loop indefinitely",
262    "unrecognized character after (?P",    "unrecognized character after (?P",
263    "syntax error after (?P",    "syntax error in subpattern name (missing terminator)",
264    "two named groups have the same name",    "two named subpatterns have the same name",
265    "invalid UTF-8 string",    "invalid UTF-8 string",
266    /* 45 */    /* 45 */
267    "support for \\P, \\p, and \\X has not been compiled",    "support for \\P, \\p, and \\X has not been compiled",
268    "malformed \\P or \\p sequence",    "malformed \\P or \\p sequence",
269    "unknown property name after \\P or \\p"    "unknown property name after \\P or \\p",
270      "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)",
271      "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")",
272      /* 50 */
273      "repeated subpattern is too long",    /** DEAD **/
274      "octal value is greater than \\377 (not in UTF-8 mode)",
275      "internal error: overran compiling workspace",
276      "internal error: previously-checked referenced subpattern not found",
277      "DEFINE group contains more than one branch",
278      /* 55 */
279      "repeating a DEFINE group is not allowed",
280      "inconsistent NEWLINE options",
281      "\\g is not followed by a braced name or an optionally braced non-zero number",
282      "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number",
283      "(*VERB) with an argument is not supported",
284      /* 60 */
285      "(*VERB) not recognized",
286      "number is too big"
287  };  };
288    
289    
# Line 229  For convenience, we use the same bit def Line 303  For convenience, we use the same bit def
303    
304  Then we can use ctype_digit and ctype_xdigit in the code. */  Then we can use ctype_digit and ctype_xdigit in the code. */
305    
306  #if !EBCDIC    /* This is the "normal" case, for ASCII systems */  #ifndef EBCDIC  /* This is the "normal" case, for ASCII systems */
307  static const unsigned char digitab[] =  static const unsigned char digitab[] =
308    {    {
309    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
# Line 265  static const unsigned char digitab[] = Line 339  static const unsigned char digitab[] =
339    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
340    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
341    
342  #else          /* This is the "abnormal" case, for EBCDIC systems */  #else           /* This is the "abnormal" case, for EBCDIC systems */
343  static const unsigned char digitab[] =  static const unsigned char digitab[] =
344    {    {
345    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
# Line 279  static const unsigned char digitab[] = Line 353  static const unsigned char digitab[] =
353    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
354    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
355    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
356    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- ¬     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- 95    */
357    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
358    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
359    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
# Line 313  static const unsigned char ebcdic_charta Line 387  static const unsigned char ebcdic_charta
387    0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */    0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */
388    0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */    0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
389    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
390    0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- ¬  */    0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- 95 */
391    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
392    0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */    0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
393    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
# Line 340  static const unsigned char ebcdic_charta Line 414  static const unsigned char ebcdic_charta
414  /* Definition to allow mutual recursion */  /* Definition to allow mutual recursion */
415    
416  static BOOL  static BOOL
417    compile_regex(int, int, int *, uschar **, const uschar **, int *, BOOL, int,    compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
418      int *, int *, branch_chain *, compile_data *);      int *, int *, branch_chain *, compile_data *, int *);
419    
420    
421    
# Line 351  static BOOL Line 425  static BOOL
425    
426  /* This function is called when a \ has been encountered. It either returns a  /* This function is called when a \ has been encountered. It either returns a
427  positive value for a simple escape such as \n, or a negative value which  positive value for a simple escape such as \n, or a negative value which
428  encodes one of the more complicated things such as \d. When UTF-8 is enabled,  encodes one of the more complicated things such as \d. A backreference to group
429  a positive value greater than 255 may be returned. On entry, ptr is pointing at  n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
430  the \. On exit, it is on the final character of the escape sequence.  UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
431    ptr is pointing at the \. On exit, it is on the final character of the escape
432    sequence.
433    
434  Arguments:  Arguments:
435    ptrptr         points to the pattern position pointer    ptrptr         points to the pattern position pointer
# Line 364  Arguments: Line 440  Arguments:
440    
441  Returns:         zero or positive => a data character  Returns:         zero or positive => a data character
442                   negative => a special escape sequence                   negative => a special escape sequence
443                   on error, errorptr is set                   on error, errorcodeptr is set
444  */  */
445    
446  static int  static int
447  check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,  check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
448    int options, BOOL isclass)    int options, BOOL isclass)
449  {  {
450  const uschar *ptr = *ptrptr;  BOOL utf8 = (options & PCRE_UTF8) != 0;
451    const uschar *ptr = *ptrptr + 1;
452  int c, i;  int c, i;
453    
454    GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */
455    ptr--;                            /* Set pointer back to the last byte */
456    
457  /* If backslash is at the end of the pattern, it's an error. */  /* If backslash is at the end of the pattern, it's an error. */
458    
 c = *(++ptr);  
459  if (c == 0) *errorcodeptr = ERR1;  if (c == 0) *errorcodeptr = ERR1;
460    
461  /* Non-alphamerics are literals. For digits or letters, do an initial lookup in  /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
462  a table. A non-zero result is something that can be returned immediately.  a table. A non-zero result is something that can be returned immediately.
463  Otherwise further processing may be required. */  Otherwise further processing may be required. */
464    
465  #if !EBCDIC    /* ASCII coding */  #ifndef EBCDIC  /* ASCII coding */
466  else if (c < '0' || c > 'z') {}                           /* Not alphameric */  else if (c < '0' || c > 'z') {}                           /* Not alphameric */
467  else if ((i = escapes[c - '0']) != 0) c = i;  else if ((i = escapes[c - '0']) != 0) c = i;
468    
469  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
470  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphameric */  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphameric */
471  else if ((i = escapes[c - 0x48]) != 0)  c = i;  else if ((i = escapes[c - 0x48]) != 0)  c = i;
472  #endif  #endif
# Line 397  else if ((i = escapes[c - 0x48]) != 0) Line 476  else if ((i = escapes[c - 0x48]) != 0)
476  else  else
477    {    {
478    const uschar *oldptr;    const uschar *oldptr;
479      BOOL braced, negated;
480    
481    switch (c)    switch (c)
482      {      {
483      /* A number of Perl escapes are not handled by PCRE. We give an explicit      /* A number of Perl escapes are not handled by PCRE. We give an explicit
# Line 410  else Line 491  else
491      *errorcodeptr = ERR37;      *errorcodeptr = ERR37;
492      break;      break;
493    
494        /* \g must be followed by a number, either plain or braced. If positive, it
495        is an absolute backreference. If negative, it is a relative backreference.
496        This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a
497        reference to a named group. This is part of Perl's movement towards a
498        unified syntax for back references. As this is synonymous with \k{name}, we
499        fudge it up by pretending it really was \k. */
500    
501        case 'g':
502        if (ptr[1] == '{')
503          {
504          const uschar *p;
505          for (p = ptr+2; *p != 0 && *p != '}'; p++)
506            if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
507          if (*p != 0 && *p != '}')
508            {
509            c = -ESC_k;
510            break;
511            }
512          braced = TRUE;
513          ptr++;
514          }
515        else braced = FALSE;
516    
517        if (ptr[1] == '-')
518          {
519          negated = TRUE;
520          ptr++;
521          }
522        else negated = FALSE;
523    
524        c = 0;
525        while ((digitab[ptr[1]] & ctype_digit) != 0)
526          c = c * 10 + *(++ptr) - '0';
527    
528        if (c < 0)
529          {
530          *errorcodeptr = ERR61;
531          break;
532          }
533    
534        if (c == 0 || (braced && *(++ptr) != '}'))
535          {
536          *errorcodeptr = ERR57;
537          break;
538          }
539    
540        if (negated)
541          {
542          if (c > bracount)
543            {
544            *errorcodeptr = ERR15;
545            break;
546            }
547          c = bracount - (c - 1);
548          }
549    
550        c = -(ESC_REF + c);
551        break;
552    
553      /* The handling of escape sequences consisting of a string of digits      /* The handling of escape sequences consisting of a string of digits
554      starting with one that is not zero is not straightforward. By experiment,      starting with one that is not zero is not straightforward. By experiment,
555      the way Perl works seems to be as follows:      the way Perl works seems to be as follows:
# Line 431  else Line 571  else
571        c -= '0';        c -= '0';
572        while ((digitab[ptr[1]] & ctype_digit) != 0)        while ((digitab[ptr[1]] & ctype_digit) != 0)
573          c = c * 10 + *(++ptr) - '0';          c = c * 10 + *(++ptr) - '0';
574          if (c < 0)
575            {
576            *errorcodeptr = ERR61;
577            break;
578            }
579        if (c < 10 || c <= bracount)        if (c < 10 || c <= bracount)
580          {          {
581          c = -(ESC_REF + c);          c = -(ESC_REF + c);
# Line 451  else Line 596  else
596        }        }
597    
598      /* \0 always starts an octal number, but we may drop through to here with a      /* \0 always starts an octal number, but we may drop through to here with a
599      larger first octal digit. */      larger first octal digit. The original code used just to take the least
600        significant 8 bits of octal numbers (I think this is what early Perls used
601        to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
602        than 3 octal digits. */
603    
604      case '0':      case '0':
605      c -= '0';      c -= '0';
606      while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')      while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
607          c = c * 8 + *(++ptr) - '0';          c = c * 8 + *(++ptr) - '0';
608      c &= 255;     /* Take least significant 8 bits */      if (!utf8 && c > 255) *errorcodeptr = ERR51;
609      break;      break;
610    
611      /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number      /* \x is complicated. \x{ddd} is a character number which can be greater
612      which can be greater than 0xff, but only if the ddd are hex digits. */      than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
613        treated as a data character. */
614    
615      case 'x':      case 'x':
616  #ifdef SUPPORT_UTF8      if (ptr[1] == '{')
     if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)  
617        {        {
618        const uschar *pt = ptr + 2;        const uschar *pt = ptr + 2;
619        register int count = 0;        int count = 0;
620    
621        c = 0;        c = 0;
622        while ((digitab[*pt] & ctype_xdigit) != 0)        while ((digitab[*pt] & ctype_xdigit) != 0)
623          {          {
624          int cc = *pt++;          register int cc = *pt++;
625            if (c == 0 && cc == '0') continue;     /* Leading zeroes */
626          count++;          count++;
627  #if !EBCDIC    /* ASCII coding */  
628    #ifndef EBCDIC  /* ASCII coding */
629          if (cc >= 'a') cc -= 32;               /* Convert to upper case */          if (cc >= 'a') cc -= 32;               /* Convert to upper case */
630          c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));          c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
631  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
632          if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */          if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */
633          c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));          c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
634  #endif  #endif
635          }          }
636    
637        if (*pt == '}')        if (*pt == '}')
638          {          {
639          if (c < 0 || count > 8) *errorcodeptr = ERR34;          if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
640          ptr = pt;          ptr = pt;
641          break;          break;
642          }          }
643    
644        /* If the sequence of hex digits does not end with '}', then we don't        /* If the sequence of hex digits does not end with '}', then we don't
645        recognize this construct; fall through to the normal \x handling. */        recognize this construct; fall through to the normal \x handling. */
646        }        }
 #endif  
647    
648      /* Read just a single hex char */      /* Read just a single-byte hex-defined char */
649    
650      c = 0;      c = 0;
651      while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)      while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
652        {        {
653        int cc;                               /* Some compilers don't like ++ */        int cc;                               /* Some compilers don't like ++ */
654        cc = *(++ptr);                        /* in initializers */        cc = *(++ptr);                        /* in initializers */
655  #if !EBCDIC    /* ASCII coding */  #ifndef EBCDIC  /* ASCII coding */
656        if (cc >= 'a') cc -= 32;              /* Convert to upper case */        if (cc >= 'a') cc -= 32;              /* Convert to upper case */
657        c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));        c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
658  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
659        if (cc <= 'z') cc += 64;              /* Convert to upper case */        if (cc <= 'z') cc += 64;              /* Convert to upper case */
660        c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));        c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
661  #endif  #endif
662        }        }
663      break;      break;
664    
665      /* Other special escapes not starting with a digit are straightforward */      /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
666        This coding is ASCII-specific, but then the whole concept of \cx is
667        ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
668    
669      case 'c':      case 'c':
670      c = *(++ptr);      c = *(++ptr);
671      if (c == 0)      if (c == 0)
672        {        {
673        *errorcodeptr = ERR2;        *errorcodeptr = ERR2;
674        return 0;        break;
675        }        }
676    
677      /* A letter is upper-cased; then the 0x40 bit is flipped. This coding  #ifndef EBCDIC  /* ASCII coding */
     is ASCII-specific, but then the whole concept of \cx is ASCII-specific.  
     (However, an EBCDIC equivalent has now been added.) */  
   
 #if !EBCDIC    /* ASCII coding */  
678      if (c >= 'a' && c <= 'z') c -= 32;      if (c >= 'a' && c <= 'z') c -= 32;
679      c ^= 0x40;      c ^= 0x40;
680  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
681      if (c >= 'a' && c <= 'z') c += 64;      if (c >= 'a' && c <= 'z') c += 64;
682      c ^= 0xC0;      c ^= 0xC0;
683  #endif  #endif
# Line 569  escape sequence. Line 719  escape sequence.
719  Argument:  Argument:
720    ptrptr         points to the pattern position pointer    ptrptr         points to the pattern position pointer
721    negptr         points to a boolean that is set TRUE for negation else FALSE    negptr         points to a boolean that is set TRUE for negation else FALSE
722      dptr           points to an int that is set to the detailed property value
723    errorcodeptr   points to the error code variable    errorcodeptr   points to the error code variable
724    
725  Returns:     value from ucp_type_table, or -1 for an invalid type  Returns:         type value from ucp_type_table, or -1 for an invalid type
726  */  */
727    
728  static int  static int
729  get_ucp(const uschar **ptrptr, BOOL *negptr, int *errorcodeptr)  get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
730  {  {
731  int c, i, bot, top;  int c, i, bot, top;
732  const uschar *ptr = *ptrptr;  const uschar *ptr = *ptrptr;
733  char name[4];  char name[32];
734    
735  c = *(++ptr);  c = *(++ptr);
736  if (c == 0) goto ERROR_RETURN;  if (c == 0) goto ERROR_RETURN;
737    
738  *negptr = FALSE;  *negptr = FALSE;
739    
740  /* \P or \p can be followed by a one- or two-character name in {}, optionally  /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
741  preceded by ^ for negation. */  negation. */
742    
743  if (c == '{')  if (c == '{')
744    {    {
# Line 596  if (c == '{') Line 747  if (c == '{')
747      *negptr = TRUE;      *negptr = TRUE;
748      ptr++;      ptr++;
749      }      }
750    for (i = 0; i <= 2; i++)    for (i = 0; i < (int)sizeof(name) - 1; i++)
751      {      {
752      c = *(++ptr);      c = *(++ptr);
753      if (c == 0) goto ERROR_RETURN;      if (c == 0) goto ERROR_RETURN;
754      if (c == '}') break;      if (c == '}') break;
755      name[i] = c;      name[i] = c;
756      }      }
757    if (c !='}')   /* Try to distinguish error cases */    if (c !='}') goto ERROR_RETURN;
     {  
     while (*(++ptr) != 0 && *ptr != '}');  
     if (*ptr == '}') goto UNKNOWN_RETURN; else goto ERROR_RETURN;  
     }  
758    name[i] = 0;    name[i] = 0;
759    }    }
760    
# Line 628  top = _pcre_utt_size; Line 775  top = _pcre_utt_size;
775    
776  while (bot < top)  while (bot < top)
777    {    {
778    i = (bot + top)/2;    i = (bot + top) >> 1;
779    c = strcmp(name, _pcre_utt[i].name);    c = strcmp(name, _pcre_utt[i].name);
780    if (c == 0) return _pcre_utt[i].value;    if (c == 0)
781        {
782        *dptr = _pcre_utt[i].value;
783        return _pcre_utt[i].type;
784        }
785    if (c > 0) bot = i + 1; else top = i;    if (c > 0) bot = i + 1; else top = i;
786    }    }
787    
 UNKNOWN_RETURN:  
788  *errorcodeptr = ERR47;  *errorcodeptr = ERR47;
789  *ptrptr = ptr;  *ptrptr = ptr;
790  return -1;  return -1;
# Line 750  return p; Line 900  return p;
900    
901    
902  /*************************************************  /*************************************************
903    *       Find forward referenced subpattern       *
904    *************************************************/
905    
906    /* This function scans along a pattern's text looking for capturing
907    subpatterns, and counting them. If it finds a named pattern that matches the
908    name it is given, it returns its number. Alternatively, if the name is NULL, it
909    returns when it reaches a given numbered subpattern. This is used for forward
910    references to subpatterns. We know that if (?P< is encountered, the name will
911    be terminated by '>' because that is checked in the first pass.
912    
913    Arguments:
914      ptr          current position in the pattern
915      count        current count of capturing parens so far encountered
916      name         name to seek, or NULL if seeking a numbered subpattern
917      lorn         name length, or subpattern number if name is NULL
918      xmode        TRUE if we are in /x mode
919    
920    Returns:       the number of the matching named or numbered subpattern, or -1 if not found
921    */
922    
923    static int
924    find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
925      BOOL xmode)
926    {
927    const uschar *thisname;
928    
929    for (; *ptr != 0; ptr++)
930      {
931      int term;
932    
933      /* Skip over backslashed characters and also entire \Q...\E */
934    
935      if (*ptr == '\\')
936        {
937        if (*(++ptr) == 0) return -1;
938        if (*ptr == 'Q') for (;;)
939          {
940          while (*(++ptr) != 0 && *ptr != '\\');
941          if (*ptr == 0) return -1;
942          if (*(++ptr) == 'E') break;
943          }
944        continue;
945        }
946    
947      /* Skip over character classes */
948    
949      if (*ptr == '[')
950        {
951        while (*(++ptr) != ']')
952          {
953          if (*ptr == 0) return -1;
954          if (*ptr == '\\')
955            {
956            if (*(++ptr) == 0) return -1;
957            if (*ptr == 'Q') for (;;)
958              {
959              while (*(++ptr) != 0 && *ptr != '\\');
960              if (*ptr == 0) return -1;
961              if (*(++ptr) == 'E') break;
962              }
963            continue;
964            }
965          }
966        continue;
967        }
968    
969      /* Skip comments in /x mode */
970    
971      if (xmode && *ptr == '#')
972        {
973        while (*(++ptr) != 0 && *ptr != '\n');
974        if (*ptr == 0) return -1;
975        continue;
976        }
977    
978      /* An opening parens must now be a real metacharacter */
979    
980      if (*ptr != '(') continue;
981      if (ptr[1] != '?' && ptr[1] != '*')
982        {
983        count++;
984        if (name == NULL && count == lorn) return count;
985        continue;
986        }
987    
988      ptr += 2;
989      if (*ptr == 'P') ptr++;                      /* Allow optional P */
990    
991      /* We have to disambiguate (?<! and (?<= from (?<name> */
992    
993      if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
994           *ptr != '\'')
995        continue;
996    
997      count++;
998    
999      if (name == NULL && count == lorn) return count;
1000      term = *ptr++;
1001      if (term == '<') term = '>';
1002      thisname = ptr;
1003      while (*ptr != term) ptr++;
1004      if (name != NULL && lorn == ptr - thisname &&
1005          strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1006        return count;
1007      }
1008    
1009    return -1;
1010    }
1011    
1012    
1013    
1014    /*************************************************
1015  *      Find first significant op code            *  *      Find first significant op code            *
1016  *************************************************/  *************************************************/
1017    
# Line 798  for (;;) Line 1060  for (;;)
1060    
1061      case OP_CALLOUT:      case OP_CALLOUT:
1062      case OP_CREF:      case OP_CREF:
1063      case OP_BRANUMBER:      case OP_RREF:
1064        case OP_DEF:
1065      code += _pcre_OP_lengths[*code];      code += _pcre_OP_lengths[*code];
1066      break;      break;
1067    
# Line 843  for (;;) Line 1106  for (;;)
1106    {    {
1107    int d;    int d;
1108    register int op = *cc;    register int op = *cc;
   if (op >= OP_BRA) op = OP_BRA;  
1109    
1110    switch (op)    switch (op)
1111      {      {
1112        case OP_CBRA:
1113      case OP_BRA:      case OP_BRA:
1114      case OP_ONCE:      case OP_ONCE:
1115      case OP_COND:      case OP_COND:
1116      d = find_fixedlength(cc, options);      d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1117      if (d < 0) return d;      if (d < 0) return d;
1118      branchlength += d;      branchlength += d;
1119      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
# Line 885  for (;;) Line 1148  for (;;)
1148      /* Skip over things that don't match chars */      /* Skip over things that don't match chars */
1149    
1150      case OP_REVERSE:      case OP_REVERSE:
     case OP_BRANUMBER:  
1151      case OP_CREF:      case OP_CREF:
1152        case OP_RREF:
1153        case OP_DEF:
1154      case OP_OPT:      case OP_OPT:
1155      case OP_CALLOUT:      case OP_CALLOUT:
1156      case OP_SOD:      case OP_SOD:
# Line 904  for (;;) Line 1168  for (;;)
1168    
1169      case OP_CHAR:      case OP_CHAR:
1170      case OP_CHARNC:      case OP_CHARNC:
1171        case OP_NOT:
1172      branchlength++;      branchlength++;
1173      cc += 2;      cc += 2;
1174  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
# Line 937  for (;;) Line 1202  for (;;)
1202    
1203      case OP_PROP:      case OP_PROP:
1204      case OP_NOTPROP:      case OP_NOTPROP:
1205      cc++;      cc += 2;
1206      /* Fall through */      /* Fall through */
1207    
1208      case OP_NOT_DIGIT:      case OP_NOT_DIGIT:
# Line 1018  Returns: pointer to the opcode for Line 1283  Returns: pointer to the opcode for
1283  static const uschar *  static const uschar *
1284  find_bracket(const uschar *code, BOOL utf8, int number)  find_bracket(const uschar *code, BOOL utf8, int number)
1285  {  {
 #ifndef SUPPORT_UTF8  
 utf8 = utf8;               /* Stop pedantic compilers complaining */  
 #endif  
   
1286  for (;;)  for (;;)
1287    {    {
1288    register int c = *code;    register int c = *code;
1289    if (c == OP_END) return NULL;    if (c == OP_END) return NULL;
1290    else if (c > OP_BRA)  
1291      /* XCLASS is used for classes that cannot be represented just by a bit
1292      map. This includes negated single high-valued characters. The length in
1293      the table is zero; the actual length is stored in the compiled code. */
1294    
1295      if (c == OP_XCLASS) code += GET(code, 1);
1296    
1297      /* Handle capturing bracket */
1298    
1299      else if (c == OP_CBRA)
1300      {      {
1301      int n = c - OP_BRA;      int n = GET2(code, 1+LINK_SIZE);
     if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE);  
1302      if (n == number) return (uschar *)code;      if (n == number) return (uschar *)code;
1303      code += _pcre_OP_lengths[OP_BRA];      code += _pcre_OP_lengths[c];
1304      }      }
1305    
1306      /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1307      a multi-byte character. The length in the table is a minimum, so we have to
1308      arrange to skip the extra bytes. */
1309    
1310    else    else
1311      {      {
1312      code += _pcre_OP_lengths[c];      code += _pcre_OP_lengths[c];
   
1313  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
   
     /* In UTF-8 mode, opcodes that are followed by a character may be followed  
     by a multi-byte character. The length in the table is a minimum, so we have  
     to scan along to skip the extra bytes. All opcodes are less than 128, so we  
     can use relatively efficient code. */  
   
1314      if (utf8) switch(c)      if (utf8) switch(c)
1315        {        {
1316        case OP_CHAR:        case OP_CHAR:
# Line 1051  for (;;) Line 1318  for (;;)
1318        case OP_EXACT:        case OP_EXACT:
1319        case OP_UPTO:        case OP_UPTO:
1320        case OP_MINUPTO:        case OP_MINUPTO:
1321          case OP_POSUPTO:
1322        case OP_STAR:        case OP_STAR:
1323        case OP_MINSTAR:        case OP_MINSTAR:
1324          case OP_POSSTAR:
1325        case OP_PLUS:        case OP_PLUS:
1326        case OP_MINPLUS:        case OP_MINPLUS:
1327          case OP_POSPLUS:
1328        case OP_QUERY:        case OP_QUERY:
1329        case OP_MINQUERY:        case OP_MINQUERY:
1330        while ((*code & 0xc0) == 0x80) code++;        case OP_POSQUERY:
1331        break;        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
   
       /* XCLASS is used for classes that cannot be represented just by a bit  
       map. This includes negated single high-valued characters. The length in  
       the table is zero; the actual length is stored in the compiled code. */  
   
       case OP_XCLASS:  
       code += GET(code, 1) + 1;  
1332        break;        break;
1333        }        }
1334  #endif  #endif
# Line 1092  Returns: pointer to the opcode for Line 1355  Returns: pointer to the opcode for
1355  static const uschar *  static const uschar *
1356  find_recurse(const uschar *code, BOOL utf8)  find_recurse(const uschar *code, BOOL utf8)
1357  {  {
 #ifndef SUPPORT_UTF8  
 utf8 = utf8;               /* Stop pedantic compilers complaining */  
 #endif  
   
1358  for (;;)  for (;;)
1359    {    {
1360    register int c = *code;    register int c = *code;
1361    if (c == OP_END) return NULL;    if (c == OP_END) return NULL;
1362    else if (c == OP_RECURSE) return code;    if (c == OP_RECURSE) return code;
1363    else if (c > OP_BRA)  
1364      {    /* XCLASS is used for classes that cannot be represented just by a bit
1365      code += _pcre_OP_lengths[OP_BRA];    map. This includes negated single high-valued characters. The length in
1366      }    the table is zero; the actual length is stored in the compiled code. */
1367    
1368      if (c == OP_XCLASS) code += GET(code, 1);
1369    
1370      /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes
1371      that are followed by a character may be followed by a multi-byte character.
1372      The length in the table is a minimum, so we have to arrange to skip the extra
1373      bytes. */
1374    
1375    else    else
1376      {      {
1377      code += _pcre_OP_lengths[c];      code += _pcre_OP_lengths[c];
   
1378  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
   
     /* In UTF-8 mode, opcodes that are followed by a character may be followed  
     by a multi-byte character. The length in the table is a minimum, so we have  
     to scan along to skip the extra bytes. All opcodes are less than 128, so we  
     can use relatively efficient code. */  
   
1379      if (utf8) switch(c)      if (utf8) switch(c)
1380        {        {
1381        case OP_CHAR:        case OP_CHAR:
# Line 1123  for (;;) Line 1383  for (;;)
1383        case OP_EXACT:        case OP_EXACT:
1384        case OP_UPTO:        case OP_UPTO:
1385        case OP_MINUPTO:        case OP_MINUPTO:
1386          case OP_POSUPTO:
1387        case OP_STAR:        case OP_STAR:
1388        case OP_MINSTAR:        case OP_MINSTAR:
1389          case OP_POSSTAR:
1390        case OP_PLUS:        case OP_PLUS:
1391        case OP_MINPLUS:        case OP_MINPLUS:
1392          case OP_POSPLUS:
1393        case OP_QUERY:        case OP_QUERY:
1394        case OP_MINQUERY:        case OP_MINQUERY:
1395        while ((*code & 0xc0) == 0x80) code++;        case OP_POSQUERY:
1396        break;        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
   
       /* XCLASS is used for classes that cannot be represented just by a bit  
       map. This includes negated single high-valued characters. The length in  
       the table is zero; the actual length is stored in the compiled code. */  
   
       case OP_XCLASS:  
       code += GET(code, 1) + 1;  
1397        break;        break;
1398        }        }
1399  #endif  #endif
# Line 1152  for (;;) Line 1408  for (;;)
1408  *************************************************/  *************************************************/
1409    
1410  /* This function scans through a branch of a compiled pattern to see whether it  /* This function scans through a branch of a compiled pattern to see whether it
1411  can match the empty string or not. It is called only from could_be_empty()  can match the empty string or not. It is called from could_be_empty()
1412  below. Note that first_significant_code() skips over assertions. If we hit an  below and from compile_branch() when checking for an unlimited repeat of a
1413  unclosed bracket, we return "empty" - this means we've struck an inner bracket  group that can match nothing. Note that first_significant_code() skips over
1414  whose current branch will already have been scanned.  assertions. If we hit an unclosed bracket, we return "empty" - this means we've
1415    struck an inner bracket whose current branch will already have been scanned.
1416    
1417  Arguments:  Arguments:
1418    code        points to start of search    code        points to start of search
# Line 1169  static BOOL Line 1426  static BOOL
1426  could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)  could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1427  {  {
1428  register int c;  register int c;
1429  for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0, TRUE);  for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1430       code < endcode;       code < endcode;
1431       code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))       code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1432    {    {
# Line 1177  for (code = first_significant_code(code Line 1434  for (code = first_significant_code(code
1434    
1435    c = *code;    c = *code;
1436    
1437    if (c >= OP_BRA)    /* Groups with zero repeats can of course be empty; skip them. */
1438    
1439      if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1440        {
1441        code += _pcre_OP_lengths[c];
1442        do code += GET(code, 1); while (*code == OP_ALT);
1443        c = *code;
1444        continue;
1445        }
1446    
1447      /* For other groups, scan the branches. */
1448    
1449      if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1450      {      {
1451      BOOL empty_branch;      BOOL empty_branch;
1452      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
# Line 1193  for (code = first_significant_code(code Line 1462  for (code = first_significant_code(code
1462        }        }
1463      while (*code == OP_ALT);      while (*code == OP_ALT);
1464      if (!empty_branch) return FALSE;   /* All branches are non-empty */      if (!empty_branch) return FALSE;   /* All branches are non-empty */
     code += 1 + LINK_SIZE;  
1465      c = *code;      c = *code;
1466        continue;
1467      }      }
1468    
1469    else switch (c)    /* Handle the other opcodes */
1470    
1471      switch (c)
1472      {      {
1473      /* Check for quantifiers after a class */      /* Check for quantifiers after a class */
1474    
# Line 1253  for (code = first_significant_code(code Line 1524  for (code = first_significant_code(code
1524      case OP_NOT:      case OP_NOT:
1525      case OP_PLUS:      case OP_PLUS:
1526      case OP_MINPLUS:      case OP_MINPLUS:
1527        case OP_POSPLUS:
1528      case OP_EXACT:      case OP_EXACT:
1529      case OP_NOTPLUS:      case OP_NOTPLUS:
1530      case OP_NOTMINPLUS:      case OP_NOTMINPLUS:
1531        case OP_NOTPOSPLUS:
1532      case OP_NOTEXACT:      case OP_NOTEXACT:
1533      case OP_TYPEPLUS:      case OP_TYPEPLUS:
1534      case OP_TYPEMINPLUS:      case OP_TYPEMINPLUS:
1535        case OP_TYPEPOSPLUS:
1536      case OP_TYPEEXACT:      case OP_TYPEEXACT:
1537      return FALSE;      return FALSE;
1538    
# Line 1270  for (code = first_significant_code(code Line 1544  for (code = first_significant_code(code
1544      case OP_ALT:      case OP_ALT:
1545      return TRUE;      return TRUE;
1546    
1547      /* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO  may be      /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1548      followed by a multibyte character */      MINUPTO, and POSUPTO may be followed by a multibyte character */
1549    
1550  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1551      case OP_STAR:      case OP_STAR:
1552      case OP_MINSTAR:      case OP_MINSTAR:
1553        case OP_POSSTAR:
1554      case OP_QUERY:      case OP_QUERY:
1555      case OP_MINQUERY:      case OP_MINQUERY:
1556        case OP_POSQUERY:
1557      case OP_UPTO:      case OP_UPTO:
1558      case OP_MINUPTO:      case OP_MINUPTO:
1559        case OP_POSUPTO:
1560      if (utf8) while ((code[2] & 0xc0) == 0x80) code++;      if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1561      break;      break;
1562  #endif  #endif
# Line 1397  earlier groups that are outside the curr Line 1674  earlier groups that are outside the curr
1674  optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before  optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1675  it, after it has been compiled. This means that any OP_RECURSE items within it  it, after it has been compiled. This means that any OP_RECURSE items within it
1676  that refer to the group itself or any contained groups have to have their  that refer to the group itself or any contained groups have to have their
1677  offsets adjusted. That is the job of this function. Before it is called, the  offsets adjusted. That is one of the jobs of this function. Before it is called,
1678  partially compiled regex must be temporarily terminated with OP_END.  the partially compiled regex must be temporarily terminated with OP_END.
1679    
1680    This function has been extended with the possibility of forward references for
1681    recursions and subroutine calls. It must also check the list of such references
1682    for the group we are dealing with. If it finds that one of the recursions in
1683    the current group is on this list, it adjusts the offset in the list, not the
1684    value in the reference (which is a group number).
1685    
1686  Arguments:  Arguments:
1687    group      points to the start of the group    group      points to the start of the group
1688    adjust     the amount by which the group is to be moved    adjust     the amount by which the group is to be moved
1689    utf8       TRUE in UTF-8 mode    utf8       TRUE in UTF-8 mode
1690    cd         contains pointers to tables etc.    cd         contains pointers to tables etc.
1691      save_hwm   the hwm forward reference pointer at the start of the group
1692    
1693  Returns:     nothing  Returns:     nothing
1694  */  */
1695    
1696  static void  static void
1697  adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd)  adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1698      uschar *save_hwm)
1699  {  {
1700  uschar *ptr = group;  uschar *ptr = group;
1701  while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)  while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1702    {    {
1703    int offset = GET(ptr, 1);    int offset;
1704    if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);    uschar *hc;
1705    
1706      /* See if this recursion is on the forward reference list. If so, adjust the
1707      reference. */
1708    
1709      for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1710        {
1711        offset = GET(hc, 0);
1712        if (cd->start_code + offset == ptr + 1)
1713          {
1714          PUT(hc, 0, offset + adjust);
1715          break;
1716          }
1717        }
1718    
1719      /* Otherwise, adjust the recursion offset if it's after the start of this
1720      group. */
1721    
1722      if (hc >= cd->hwm)
1723        {
1724        offset = GET(ptr, 1);
1725        if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1726        }
1727    
1728    ptr += 1 + LINK_SIZE;    ptr += 1 + LINK_SIZE;
1729    }    }
1730  }  }
# Line 1495  Yield: TRUE when range returned; Line 1803  Yield: TRUE when range returned;
1803  */  */
1804    
1805  static BOOL  static BOOL
1806  get_othercase_range(int *cptr, int d, int *ocptr, int *odptr)  get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
1807      unsigned int *odptr)
1808  {  {
1809  int c, chartype, othercase, next;  unsigned int c, othercase, next;
1810    
1811  for (c = *cptr; c <= d; c++)  for (c = *cptr; c <= d; c++)
1812    {    { if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) break; }
   if (_pcre_ucp_findchar(c, &chartype, &othercase) == ucp_L && othercase != 0)  
     break;  
   }  
1813    
1814  if (c > d) return FALSE;  if (c > d) return FALSE;
1815    
# Line 1512  next = othercase + 1; Line 1818  next = othercase + 1;
1818    
1819  for (++c; c <= d; c++)  for (++c; c <= d; c++)
1820    {    {
1821    if (_pcre_ucp_findchar(c, &chartype, &othercase) != ucp_L ||    if (_pcre_ucp_othercase(c) != next) break;
         othercase != next)  
     break;  
1822    next++;    next++;
1823    }    }
1824    
# Line 1526  return TRUE; Line 1830  return TRUE;
1830  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
1831    
1832    
1833    
1834  /*************************************************  /*************************************************
1835  *           Compile one branch                   *  *     Check if auto-possessifying is possible    *
1836  *************************************************/  *************************************************/
1837    
1838  /* Scan the pattern, compiling it into the code vector. If the options are  /* This function is called for unlimited repeats of certain items, to see
1839  changed during the branch, the pointer is used to change the external options  whether the next thing could possibly match the repeated item. If not, it makes
1840  bits.  sense to automatically possessify the repeated item.
1841    
1842  Arguments:  Arguments:
1843    optionsptr     pointer to the option bits    op_code       the repeated op code
1844    brackets       points to number of extracting brackets used    item          data for this item, depends on the opcode
1845    codeptr        points to the pointer to the current code point    utf8          TRUE in UTF-8 mode
1846    ptrptr         points to the current pattern pointer    utf8_char     used for utf8 character bytes, NULL if not relevant
1847    errorcodeptr   points to error code variable    ptr           next character in pattern
1848    firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)    options       options bits
1849    reqbyteptr     set to the last literal character required, else < 0    cd            contains pointers to tables etc.
   bcptr          points to current branch chain  
   cd             contains pointers to tables etc.  
1850    
1851  Returns:         TRUE on success  Returns:        TRUE if possessifying is wanted
                  FALSE, with *errorcodeptr set non-zero on error  
1852  */  */
1853    
1854  static BOOL  static BOOL
1855  compile_branch(int *optionsptr, int *brackets, uschar **codeptr,  check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
1856    const uschar **ptrptr, int *errorcodeptr, int *firstbyteptr,    const uschar *ptr, int options, compile_data *cd)
   int *reqbyteptr, branch_chain *bcptr, compile_data *cd)  
1857  {  {
1858  int repeat_type, op_type;  int next;
1859  int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */  
1860  int bravalue = 0;  /* Skip whitespace and comments in extended mode */
1861  int greedy_default, greedy_non_default;  
1862  int firstbyte, reqbyte;  if ((options & PCRE_EXTENDED) != 0)
1863  int zeroreqbyte, zerofirstbyte;    {
1864  int req_caseopt, reqvary, tempreqvary;    for (;;)
1865  int condcount = 0;      {
1866  int options = *optionsptr;      while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1867  int after_manual_callout = 0;      if (*ptr == '#')
1868  register int c;        {
1869  register uschar *code = *codeptr;        while (*(++ptr) != 0)
1870  uschar *tempcode;          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1871  BOOL inescq = FALSE;        }
1872  BOOL groupsetfirstbyte = FALSE;      else break;
1873  const uschar *ptr = *ptrptr;      }
1874  const uschar *tempptr;    }
1875  uschar *previous = NULL;  
1876  uschar *previous_callout = NULL;  /* If the next item is one that we can handle, get its value. A non-negative
1877  uschar classbits[32];  value is a character, a negative value is an escape value. */
1878    
1879    if (*ptr == '\\')
1880      {
1881      int temperrorcode = 0;
1882      next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
1883      if (temperrorcode != 0) return FALSE;
1884      ptr++;    /* Point after the escape sequence */
1885      }
1886    
1887    else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
1888      {
1889  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1890  BOOL class_utf8;    if (utf8) { GETCHARINC(next, ptr); } else
 BOOL utf8 = (options & PCRE_UTF8) != 0;  
 uschar *class_utf8data;  
 uschar utf8_char[6];  
 #else  
 BOOL utf8 = FALSE;  
1891  #endif  #endif
1892      next = *ptr++;
1893      }
1894    
1895  /* Set up the default and non-default settings for greediness */  else return FALSE;
1896    
1897  greedy_default = ((options & PCRE_UNGREEDY) != 0);  /* Skip whitespace and comments in extended mode */
 greedy_non_default = greedy_default ^ 1;  
1898    
1899  /* Initialize no first byte, no required byte. REQ_UNSET means "no char  if ((options & PCRE_EXTENDED) != 0)
1900  matching encountered yet". It gets changed to REQ_NONE if we hit something that    {
1901  matches a non-fixed char first char; reqbyte just remains unset if we never    for (;;)
1902  find one.      {
1903        while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1904        if (*ptr == '#')
1905          {
1906          while (*(++ptr) != 0)
1907            if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1908          }
1909        else break;
1910        }
1911      }
1912    
1913  When we hit a repeat whose minimum is zero, we may have to adjust these values  /* If the next thing is itself optional, we have to give up. */
 to take the zero repeat into account. This is implemented by setting them to  
 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual  
 item types that can be repeated set these backoff variables appropriately. */  
1914    
1915  firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;  if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
1916      return FALSE;
1917    
1918  /* The variable req_caseopt contains either the REQ_CASELESS value or zero,  /* Now compare the next item with the previous opcode. If the previous is a
1919  according to the current setting of the caseless flag. REQ_CASELESS is a bit  positive single character match, "item" either contains the character or, if
1920  value > 255. It is added into the firstbyte or reqbyte variables to record the  "item" is greater than 127 in utf8 mode, the character's bytes are in
1921  case status of the value. This is used only for ASCII characters. */  utf8_char. */
1922    
 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;  
1923    
1924  /* Switch on next character until the end of the branch */  /* Handle cases when the next item is a character. */
1925    
1926  for (;; ptr++)  if (next >= 0) switch(op_code)
1927    {    {
1928    BOOL negate_class;    case OP_CHAR:
1929    BOOL possessive_quantifier;  #ifdef SUPPORT_UTF8
1930    BOOL is_quantifier;    if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1931    int class_charcount;  #endif
1932    int class_lastchar;    return item != next;
1933    int newoptions;  
1934      /* For CHARNC (caseless character) we must check the other case. If we have
1935      Unicode property support, we can use it to test the other case of
1936      high-valued characters. */
1937    
1938      case OP_CHARNC:
1939    #ifdef SUPPORT_UTF8
1940      if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1941    #endif
1942      if (item == next) return FALSE;
1943    #ifdef SUPPORT_UTF8
1944      if (utf8)
1945        {
1946        unsigned int othercase;
1947        if (next < 128) othercase = cd->fcc[next]; else
1948    #ifdef SUPPORT_UCP
1949        othercase = _pcre_ucp_othercase((unsigned int)next);
1950    #else
1951        othercase = NOTACHAR;
1952    #endif
1953        return (unsigned int)item != othercase;
1954        }
1955      else
1956    #endif  /* SUPPORT_UTF8 */
1957      return (item != cd->fcc[next]);  /* Non-UTF-8 mode */
1958    
1959      /* For OP_NOT, "item" must be a single-byte character. */
1960    
1961      case OP_NOT:
1962      if (next < 0) return FALSE;  /* Not a character */
1963      if (item == next) return TRUE;
1964      if ((options & PCRE_CASELESS) == 0) return FALSE;
1965    #ifdef SUPPORT_UTF8
1966      if (utf8)
1967        {
1968        unsigned int othercase;
1969        if (next < 128) othercase = cd->fcc[next]; else
1970    #ifdef SUPPORT_UCP
1971        othercase = _pcre_ucp_othercase(next);
1972    #else
1973        othercase = NOTACHAR;
1974    #endif
1975        return (unsigned int)item == othercase;
1976        }
1977      else
1978    #endif  /* SUPPORT_UTF8 */
1979      return (item == cd->fcc[next]);  /* Non-UTF-8 mode */
1980    
1981      case OP_DIGIT:
1982      return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
1983    
1984      case OP_NOT_DIGIT:
1985      return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
1986    
1987      case OP_WHITESPACE:
1988      return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
1989    
1990      case OP_NOT_WHITESPACE:
1991      return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
1992    
1993      case OP_WORDCHAR:
1994      return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
1995    
1996      case OP_NOT_WORDCHAR:
1997      return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
1998    
1999      case OP_HSPACE:
2000      case OP_NOT_HSPACE:
2001      switch(next)
2002        {
2003        case 0x09:
2004        case 0x20:
2005        case 0xa0:
2006        case 0x1680:
2007        case 0x180e:
2008        case 0x2000:
2009        case 0x2001:
2010        case 0x2002:
2011        case 0x2003:
2012        case 0x2004:
2013        case 0x2005:
2014        case 0x2006:
2015        case 0x2007:
2016        case 0x2008:
2017        case 0x2009:
2018        case 0x200A:
2019        case 0x202f:
2020        case 0x205f:
2021        case 0x3000:
2022        return op_code != OP_HSPACE;
2023        default:
2024        return op_code == OP_HSPACE;
2025        }
2026    
2027      case OP_VSPACE:
2028      case OP_NOT_VSPACE:
2029      switch(next)
2030        {
2031        case 0x0a:
2032        case 0x0b:
2033        case 0x0c:
2034        case 0x0d:
2035        case 0x85:
2036        case 0x2028:
2037        case 0x2029:
2038        return op_code != OP_VSPACE;
2039        default:
2040        return op_code == OP_VSPACE;
2041        }
2042    
2043      default:
2044      return FALSE;
2045      }
2046    
2047    
2048    /* Handle the case when the next item is \d, \s, etc. */
2049    
2050    switch(op_code)
2051      {
2052      case OP_CHAR:
2053      case OP_CHARNC:
2054    #ifdef SUPPORT_UTF8
2055      if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2056    #endif
2057      switch(-next)
2058        {
2059        case ESC_d:
2060        return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2061    
2062        case ESC_D:
2063        return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2064    
2065        case ESC_s:
2066        return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2067    
2068        case ESC_S:
2069        return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2070    
2071        case ESC_w:
2072        return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2073    
2074        case ESC_W:
2075        return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2076    
2077        case ESC_h:
2078        case ESC_H:
2079        switch(item)
2080          {
2081          case 0x09:
2082          case 0x20:
2083          case 0xa0:
2084          case 0x1680:
2085          case 0x180e:
2086          case 0x2000:
2087          case 0x2001:
2088          case 0x2002:
2089          case 0x2003:
2090          case 0x2004:
2091          case 0x2005:
2092          case 0x2006:
2093          case 0x2007:
2094          case 0x2008:
2095          case 0x2009:
2096          case 0x200A:
2097          case 0x202f:
2098          case 0x205f:
2099          case 0x3000:
2100          return -next != ESC_h;
2101          default:
2102          return -next == ESC_h;
2103          }
2104    
2105        case ESC_v:
2106        case ESC_V:
2107        switch(item)
2108          {
2109          case 0x0a:
2110          case 0x0b:
2111          case 0x0c:
2112          case 0x0d:
2113          case 0x85:
2114          case 0x2028:
2115          case 0x2029:
2116          return -next != ESC_v;
2117          default:
2118          return -next == ESC_v;
2119          }
2120    
2121        default:
2122        return FALSE;
2123        }
2124    
2125      case OP_DIGIT:
2126      return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2127             next == -ESC_h || next == -ESC_v;
2128    
2129      case OP_NOT_DIGIT:
2130      return next == -ESC_d;
2131    
2132      case OP_WHITESPACE:
2133      return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2134    
2135      case OP_NOT_WHITESPACE:
2136      return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2137    
2138      case OP_HSPACE:
2139      return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2140    
2141      case OP_NOT_HSPACE:
2142      return next == -ESC_h;
2143    
2144      /* Can't have \S in here because VT matches \S (Perl anomaly) */
2145      case OP_VSPACE:
2146      return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2147    
2148      case OP_NOT_VSPACE:
2149      return next == -ESC_v;
2150    
2151      case OP_WORDCHAR:
2152      return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2153    
2154      case OP_NOT_WORDCHAR:
2155      return next == -ESC_w || next == -ESC_d;
2156    
2157      default:
2158      return FALSE;
2159      }
2160    
2161    /* Control does not reach here */
2162    }
2163    
2164    
2165    
2166    /*************************************************
2167    *           Compile one branch                   *
2168    *************************************************/
2169    
2170    /* Scan the pattern, compiling it into a vector. If the options are
2171    changed during the branch, the pointer is used to change the external options
2172    bits. This function is used during the pre-compile phase when we are trying
2173    to find out the amount of memory needed, as well as during the real compile
2174    phase. The value of lengthptr distinguishes the two phases.
2175    
2176    Arguments:
2177      optionsptr     pointer to the option bits
2178      codeptr        points to the pointer to the current code point
2179      ptrptr         points to the current pattern pointer
2180      errorcodeptr   points to error code variable
2181      firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2182      reqbyteptr     set to the last literal character required, else < 0
2183      bcptr          points to current branch chain
2184      cd             contains pointers to tables etc.
2185      lengthptr      NULL during the real compile phase
2186                     points to length accumulator during pre-compile phase
2187    
2188    Returns:         TRUE on success
2189                     FALSE, with *errorcodeptr set non-zero on error
2190    */
2191    
2192    static BOOL
2193    compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2194      int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2195      compile_data *cd, int *lengthptr)
2196    {
2197    int repeat_type, op_type;
2198    int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
2199    int bravalue = 0;
2200    int greedy_default, greedy_non_default;
2201    int firstbyte, reqbyte;
2202    int zeroreqbyte, zerofirstbyte;
2203    int req_caseopt, reqvary, tempreqvary;
2204    int options = *optionsptr;
2205    int after_manual_callout = 0;
2206    int length_prevgroup = 0;
2207    register int c;
2208    register uschar *code = *codeptr;
2209    uschar *last_code = code;
2210    uschar *orig_code = code;
2211    uschar *tempcode;
2212    BOOL inescq = FALSE;
2213    BOOL groupsetfirstbyte = FALSE;
2214    const uschar *ptr = *ptrptr;
2215    const uschar *tempptr;
2216    uschar *previous = NULL;
2217    uschar *previous_callout = NULL;
2218    uschar *save_hwm = NULL;
2219    uschar classbits[32];
2220    
2221    #ifdef SUPPORT_UTF8
2222    BOOL class_utf8;
2223    BOOL utf8 = (options & PCRE_UTF8) != 0;
2224    uschar *class_utf8data;
2225    uschar utf8_char[6];
2226    #else
2227    BOOL utf8 = FALSE;
2228    uschar *utf8_char = NULL;
2229    #endif
2230    
2231    #ifdef DEBUG
2232    if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2233    #endif
2234    
2235    /* Set up the default and non-default settings for greediness */
2236    
2237    greedy_default = ((options & PCRE_UNGREEDY) != 0);
2238    greedy_non_default = greedy_default ^ 1;
2239    
2240    /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2241    matching encountered yet". It gets changed to REQ_NONE if we hit something that
2242    matches a non-fixed char first char; reqbyte just remains unset if we never
2243    find one.
2244    
2245    When we hit a repeat whose minimum is zero, we may have to adjust these values
2246    to take the zero repeat into account. This is implemented by setting them to
2247    zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2248    item types that can be repeated set these backoff variables appropriately. */
2249    
2250    firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2251    
2252    /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2253    according to the current setting of the caseless flag. REQ_CASELESS is a bit
2254    value > 255. It is added into the firstbyte or reqbyte variables to record the
2255    case status of the value. This is used only for ASCII characters. */
2256    
2257    req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2258    
2259    /* Switch on next character until the end of the branch */
2260    
2261    for (;; ptr++)
2262      {
2263      BOOL negate_class;
2264      BOOL possessive_quantifier;
2265      BOOL is_quantifier;
2266      BOOL is_recurse;
2267      BOOL reset_bracount;
2268      int class_charcount;
2269      int class_lastchar;
2270      int newoptions;
2271    int recno;    int recno;
2272      int refsign;
2273    int skipbytes;    int skipbytes;
2274    int subreqbyte;    int subreqbyte;
2275    int subfirstbyte;    int subfirstbyte;
2276      int terminator;
2277    int mclength;    int mclength;
2278    uschar mcbuffer[8];    uschar mcbuffer[8];
2279    
2280    /* Next byte in the pattern */    /* Get next byte in the pattern */
2281    
2282    c = *ptr;    c = *ptr;
2283    
2284      /* If we are in the pre-compile phase, accumulate the length used for the
2285      previous cycle of this loop. */
2286    
2287      if (lengthptr != NULL)
2288        {
2289    #ifdef DEBUG
2290        if (code > cd->hwm) cd->hwm = code;                 /* High water info */
2291    #endif
2292        if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2293          {
2294          *errorcodeptr = ERR52;
2295          goto FAILED;
2296          }
2297    
2298        /* There is at least one situation where code goes backwards: this is the
2299        case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2300        the class is simply eliminated. However, it is created first, so we have to
2301        allow memory for it. Therefore, don't ever reduce the length at this point.
2302        */
2303    
2304        if (code < last_code) code = last_code;
2305    
2306        /* Paranoid check for integer overflow */
2307    
2308        if (OFLOW_MAX - *lengthptr < code - last_code)
2309          {
2310          *errorcodeptr = ERR20;
2311          goto FAILED;
2312          }
2313    
2314        *lengthptr += code - last_code;
2315        DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2316    
2317        /* If "previous" is set and it is not at the start of the work space, move
2318        it back to there, in order to avoid filling up the work space. Otherwise,
2319        if "previous" is NULL, reset the current code pointer to the start. */
2320    
2321        if (previous != NULL)
2322          {
2323          if (previous > orig_code)
2324            {
2325            memmove(orig_code, previous, code - previous);
2326            code -= previous - orig_code;
2327            previous = orig_code;
2328            }
2329          }
2330        else code = orig_code;
2331    
2332        /* Remember where this code item starts so we can pick up the length
2333        next time round. */
2334    
2335        last_code = code;
2336        }
2337    
2338      /* In the real compile phase, just check the workspace used by the forward
2339      reference list. */
2340    
2341      else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2342        {
2343        *errorcodeptr = ERR52;
2344        goto FAILED;
2345        }
2346    
2347    /* If in \Q...\E, check for the end; if not, we have a literal */    /* If in \Q...\E, check for the end; if not, we have a literal */
2348    
2349    if (inescq && c != 0)    if (inescq && c != 0)
# Line 1643  for (;; ptr++) Line 2358  for (;; ptr++)
2358        {        {
2359        if (previous_callout != NULL)        if (previous_callout != NULL)
2360          {          {
2361          complete_callout(previous_callout, ptr, cd);          if (lengthptr == NULL)  /* Don't attempt in pre-compile phase */
2362              complete_callout(previous_callout, ptr, cd);
2363          previous_callout = NULL;          previous_callout = NULL;
2364          }          }
2365        if ((options & PCRE_AUTO_CALLOUT) != 0)        if ((options & PCRE_AUTO_CALLOUT) != 0)
# Line 1664  for (;; ptr++) Line 2380  for (;; ptr++)
2380    if (!is_quantifier && previous_callout != NULL &&    if (!is_quantifier && previous_callout != NULL &&
2381         after_manual_callout-- <= 0)         after_manual_callout-- <= 0)
2382      {      {
2383      complete_callout(previous_callout, ptr, cd);      if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
2384          complete_callout(previous_callout, ptr, cd);
2385      previous_callout = NULL;      previous_callout = NULL;
2386      }      }
2387    
# Line 1675  for (;; ptr++) Line 2392  for (;; ptr++)
2392      if ((cd->ctypes[c] & ctype_space) != 0) continue;      if ((cd->ctypes[c] & ctype_space) != 0) continue;
2393      if (c == '#')      if (c == '#')
2394        {        {
2395        /* The space before the ; is to avoid a warning on a silly compiler        while (*(++ptr) != 0)
2396        on the Macintosh. */          {
2397        while ((c = *(++ptr)) != 0 && c != NEWLINE) ;          if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2398        if (c != 0) continue;   /* Else fall through to handle end of string */          }
2399          if (*ptr != 0) continue;
2400    
2401          /* Else fall through to handle end of string */
2402          c = 0;
2403        }        }
2404      }      }
2405    
# Line 1692  for (;; ptr++) Line 2413  for (;; ptr++)
2413    
2414    switch(c)    switch(c)
2415      {      {
2416      /* The branch terminates at end of string, |, or ). */      /* ===================================================================*/
2417        case 0:                        /* The branch terminates at string end */
2418      case 0:      case '|':                      /* or | or ) */
     case '|':  
2419      case ')':      case ')':
2420      *firstbyteptr = firstbyte;      *firstbyteptr = firstbyte;
2421      *reqbyteptr = reqbyte;      *reqbyteptr = reqbyte;
2422      *codeptr = code;      *codeptr = code;
2423      *ptrptr = ptr;      *ptrptr = ptr;
2424        if (lengthptr != NULL)
2425          {
2426          if (OFLOW_MAX - *lengthptr < code - last_code)
2427            {
2428            *errorcodeptr = ERR20;
2429            goto FAILED;
2430            }
2431          *lengthptr += code - last_code;   /* To include callout length */
2432          DPRINTF((">> end branch\n"));
2433          }
2434      return TRUE;      return TRUE;
2435    
2436    
2437        /* ===================================================================*/
2438      /* Handle single-character metacharacters. In multiline mode, ^ disables      /* Handle single-character metacharacters. In multiline mode, ^ disables
2439      the setting of any following char as a first character. */      the setting of any following char as a first character. */
2440    
# Line 1731  for (;; ptr++) Line 2463  for (;; ptr++)
2463      *code++ = OP_ANY;      *code++ = OP_ANY;
2464      break;      break;
2465    
2466      /* Character classes. If the included characters are all < 255 in value, we  
2467      build a 32-byte bitmap of the permitted characters, except in the special      /* ===================================================================*/
2468      case where there is only one such character. For negated classes, we build      /* Character classes. If the included characters are all < 256, we build a
2469      the map as usual, then invert it at the end. However, we use a different      32-byte bitmap of the permitted characters, except in the special case
2470      opcode so that data characters > 255 can be handled correctly.      where there is only one such character. For negated classes, we build the
2471        map as usual, then invert it at the end. However, we use a different opcode
2472        so that data characters > 255 can be handled correctly.
2473    
2474      If the class contains characters outside the 0-255 range, a different      If the class contains characters outside the 0-255 range, a different
2475      opcode is compiled. It may optionally have a bit map for characters < 256,      opcode is compiled. It may optionally have a bit map for characters < 256,
# Line 1756  for (;; ptr++) Line 2490  for (;; ptr++)
2490        goto FAILED;        goto FAILED;
2491        }        }
2492    
2493      /* If the first character is '^', set the negation flag and skip it. */      /* If the first character is '^', set the negation flag and skip it. Also,
2494        if the first few characters (either before or after ^) are \Q\E or \E we
2495        skip them too. This makes for compatibility with Perl. */
2496    
2497      if ((c = *(++ptr)) == '^')      negate_class = FALSE;
2498        for (;;)
2499        {        {
       negate_class = TRUE;  
2500        c = *(++ptr);        c = *(++ptr);
2501        }        if (c == '\\')
2502      else          {
2503        {          if (ptr[1] == 'E') ptr++;
2504        negate_class = FALSE;            else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
2505                else break;
2506            }
2507          else if (!negate_class && c == '^')
2508            negate_class = TRUE;
2509          else break;
2510        }        }
2511    
2512      /* Keep a count of chars with values < 256 so that we can optimize the case      /* Keep a count of chars with values < 256 so that we can optimize the case
2513      of just a single character (as long as it's < 256). For higher UTF-8      of just a single character (as long as it's < 256). However, for higher
2514      characters, we don't yet do any optimization. */      valued UTF-8 characters, we don't yet do any optimization. */
2515    
2516      class_charcount = 0;      class_charcount = 0;
2517      class_lastchar = -1;      class_lastchar = -1;
2518    
2519        /* Initialize the 32-char bit map to all zeros. We build the map in a
2520        temporary bit of memory, in case the class contains only 1 character (less
2521        than 256), because in that case the compiled code doesn't use the bit map.
2522        */
2523    
2524        memset(classbits, 0, 32 * sizeof(uschar));
2525    
2526  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2527      class_utf8 = FALSE;                       /* No chars >= 256 */      class_utf8 = FALSE;                       /* No chars >= 256 */
2528      class_utf8data = code + LINK_SIZE + 34;   /* For UTF-8 items */      class_utf8data = code + LINK_SIZE + 2;    /* For UTF-8 items */
2529  #endif  #endif
2530    
     /* Initialize the 32-char bit map to all zeros. We have to build the  
     map in a temporary bit of store, in case the class contains only 1  
     character (< 256), because in that case the compiled code doesn't use the  
     bit map. */  
   
     memset(classbits, 0, 32 * sizeof(uschar));  
   
2531      /* Process characters until ] is reached. By writing this as a "do" it      /* Process characters until ] is reached. By writing this as a "do" it
2532      means that an initial ] is taken as a data character. The first pass      means that an initial ] is taken as a data character. At the start of the
2533      through the regex checked the overall syntax, so we don't need to be very      loop, c contains the first byte of the character. */
     strict here. At the start of the loop, c contains the first byte of the  
     character. */  
2534    
2535      do      if (c != 0) do
2536        {        {
2537          const uschar *oldptr;
2538    
2539  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2540        if (utf8 && c > 127)        if (utf8 && c > 127)
2541          {                           /* Braces are required because the */          {                           /* Braces are required because the */
# Line 1806  for (;; ptr++) Line 2547  for (;; ptr++)
2547    
2548        if (inescq)        if (inescq)
2549          {          {
2550          if (c == '\\' && ptr[1] == 'E')          if (c == '\\' && ptr[1] == 'E')     /* If we are at \E */
2551            {            {
2552            inescq = FALSE;            inescq = FALSE;                   /* Reset literal state */
2553            ptr++;            ptr++;                            /* Skip the 'E' */
2554            continue;            continue;                         /* Carry on with next */
2555            }            }
2556          else goto LONE_SINGLE_CHARACTER;          goto CHECK_RANGE;                   /* Could be range if \E follows */
2557          }          }
2558    
2559        /* Handle POSIX class names. Perl allows a negation extension of the        /* Handle POSIX class names. Perl allows a negation extension of the
# Line 1826  for (;; ptr++) Line 2567  for (;; ptr++)
2567            check_posix_syntax(ptr, &tempptr, cd))            check_posix_syntax(ptr, &tempptr, cd))
2568          {          {
2569          BOOL local_negate = FALSE;          BOOL local_negate = FALSE;
2570          int posix_class, i;          int posix_class, taboffset, tabopt;
2571          register const uschar *cbits = cd->cbits;          register const uschar *cbits = cd->cbits;
2572            uschar pbits[32];
2573    
2574          if (ptr[1] != ':')          if (ptr[1] != ':')
2575            {            {
# Line 1856  for (;; ptr++) Line 2598  for (;; ptr++)
2598          if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)          if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2599            posix_class = 0;            posix_class = 0;
2600    
2601          /* Or into the map we are building up to 3 of the static class          /* We build the bit map for the POSIX class in a chunk of local store
2602          tables, or their negations. The [:blank:] class sets up the same          because we may be adding and subtracting from it, and we don't want to
2603          chars as the [:space:] class (all white space). We remove the vertical          subtract bits that may be in the main map already. At the end we or the
2604          white space chars afterwards. */          result into the bit map that is being built. */
2605    
2606          posix_class *= 3;          posix_class *= 3;
2607          for (i = 0; i < 3; i++)  
2608            /* Copy in the first table (always present) */
2609    
2610            memcpy(pbits, cbits + posix_class_maps[posix_class],
2611              32 * sizeof(uschar));
2612    
2613            /* If there is a second table, add or remove it as required. */
2614    
2615            taboffset = posix_class_maps[posix_class + 1];
2616            tabopt = posix_class_maps[posix_class + 2];
2617    
2618            if (taboffset >= 0)
2619            {            {
2620            BOOL blankclass = strncmp((char *)ptr, "blank", 5) == 0;            if (tabopt >= 0)
2621            int taboffset = posix_class_maps[posix_class + i];              for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
           if (taboffset < 0) break;  
           if (local_negate)  
             {  
             if (i == 0)  
               for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+taboffset];  
             else  
               for (c = 0; c < 32; c++) classbits[c] &= ~cbits[c+taboffset];  
             if (blankclass) classbits[1] |= 0x3c;  
             }  
2622            else            else
2623              {              for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+taboffset];  
             if (blankclass) classbits[1] &= ~0x3c;  
             }  
2624            }            }
2625    
2626            /* Now see if we need to remove any special characters. An option
2627            value of 1 removes vertical space and 2 removes underscore. */
2628    
2629            if (tabopt < 0) tabopt = -tabopt;
2630            if (tabopt == 1) pbits[1] &= ~0x3c;
2631              else if (tabopt == 2) pbits[11] &= 0x7f;
2632    
2633            /* Add the POSIX table or its complement into the main table that is
2634            being built and we are done. */
2635    
2636            if (local_negate)
2637              for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2638            else
2639              for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2640    
2641          ptr = tempptr + 1;          ptr = tempptr + 1;
2642          class_charcount = 10;  /* Set > 1; assumes more than 1 per class */          class_charcount = 10;  /* Set > 1; assumes more than 1 per class */
2643          continue;    /* End of POSIX syntax handling */          continue;    /* End of POSIX syntax handling */
2644          }          }
2645    
2646        /* Backslash may introduce a single character, or it may introduce one        /* Backslash may introduce a single character, or it may introduce one
2647        of the specials, which just set a flag. Escaped items are checked for        of the specials, which just set a flag. The sequence \b is a special
2648        validity in the pre-compiling pass. The sequence \b is a special case.        case. Inside a class (and only there) it is treated as backspace.
2649        Inside a class (and only there) it is treated as backspace. Elsewhere        Elsewhere it marks a word boundary. Other escapes have preset maps ready
2650        it marks a word boundary. Other escapes have preset maps ready to        to 'or' into the one we are building. We assume they have more than one
       or into the one we are building. We assume they have more than one  
2651        character in them, so set class_charcount bigger than one. */        character in them, so set class_charcount bigger than one. */
2652    
2653        if (c == '\\')        if (c == '\\')
2654          {          {
2655          c = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);          c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2656            if (*errorcodeptr != 0) goto FAILED;
2657    
2658          if (-c == ESC_b) c = '\b';       /* \b is backspace in a class */          if (-c == ESC_b) c = '\b';       /* \b is backspace in a class */
2659          else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */          else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */
2660            else if (-c == ESC_R) c = 'R';   /* \R is literal R in a class */
2661          else if (-c == ESC_Q)            /* Handle start of quoted string */          else if (-c == ESC_Q)            /* Handle start of quoted string */
2662            {            {
2663            if (ptr[1] == '\\' && ptr[2] == 'E')            if (ptr[1] == '\\' && ptr[2] == 'E')
# Line 1915  for (;; ptr++) Line 2672  for (;; ptr++)
2672            {            {
2673            register const uschar *cbits = cd->cbits;            register const uschar *cbits = cd->cbits;
2674            class_charcount += 2;     /* Greater than 1 is what matters */            class_charcount += 2;     /* Greater than 1 is what matters */
2675            switch (-c)  
2676              /* Save time by not doing this in the pre-compile phase. */
2677    
2678              if (lengthptr == NULL) switch (-c)
2679              {              {
2680              case ESC_d:              case ESC_d:
2681              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
# Line 1943  for (;; ptr++) Line 2703  for (;; ptr++)
2703              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
2704              continue;              continue;
2705    
2706  #ifdef SUPPORT_UCP              case ESC_E: /* Perl ignores an orphan \E */
             case ESC_p:  
             case ESC_P:  
               {  
               BOOL negated;  
               int property = get_ucp(&ptr, &negated, errorcodeptr);  
               if (property < 0) goto FAILED;  
               class_utf8 = TRUE;  
               *class_utf8data++ = ((-c == ESC_p) != negated)?  
                 XCL_PROP : XCL_NOTPROP;  
               *class_utf8data++ = property;  
               class_charcount -= 2;   /* Not a < 256 character */  
               }  
2707              continue;              continue;
 #endif  
2708    
2709              /* Unrecognized escapes are faulted if PCRE is running in its              default:    /* Not recognized; fall through */
2710              strict mode. By default, for compatibility with Perl, they are              break;      /* Need "default" setting to stop compiler warning. */
             treated as literals. */  
   
             default:  
             if ((options & PCRE_EXTRA) != 0)  
               {  
               *errorcodeptr = ERR7;  
               goto FAILED;  
               }  
             c = *ptr;              /* The final character */  
             class_charcount -= 2;  /* Undo the default count from above */  
2711              }              }
           }  
   
         /* Fall through if we have a single character (c >= 0). This may be  
         > 256 in UTF-8 mode. */  
2712    
2713          }   /* End of backslash handling */            /* In the pre-compile phase, just do the recognition. */
2714    
2715        /* A single character may be followed by '-' to form a range. However,            else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2716        Perl does not permit ']' to be the end of the range. A '-' character                     c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
       here is treated as a literal. */  
2717    
2718        if (ptr[1] == '-' && ptr[2] != ']')            /* We need to deal with \H, \h, \V, and \v in both phases because
2719          {            they use extra memory. */
2720          int d;  
2721          ptr += 2;            if (-c == ESC_h)
2722                {
2723                SETBIT(classbits, 0x09); /* HT */
2724                SETBIT(classbits, 0x20); /* SPACE */
2725                SETBIT(classbits, 0xa0); /* NBSP */
2726    #ifdef SUPPORT_UTF8
2727                if (utf8)
2728                  {
2729                  class_utf8 = TRUE;
2730                  *class_utf8data++ = XCL_SINGLE;
2731                  class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2732                  *class_utf8data++ = XCL_SINGLE;
2733                  class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2734                  *class_utf8data++ = XCL_RANGE;
2735                  class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2736                  class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2737                  *class_utf8data++ = XCL_SINGLE;
2738                  class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2739                  *class_utf8data++ = XCL_SINGLE;
2740                  class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2741                  *class_utf8data++ = XCL_SINGLE;
2742                  class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
2743                  }
2744    #endif
2745                continue;
2746                }
2747    
2748              if (-c == ESC_H)
2749                {
2750                for (c = 0; c < 32; c++)
2751                  {
2752                  int x = 0xff;
2753                  switch (c)
2754                    {
2755                    case 0x09/8: x ^= 1 << (0x09%8); break;
2756                    case 0x20/8: x ^= 1 << (0x20%8); break;
2757                    case 0xa0/8: x ^= 1 << (0xa0%8); break;
2758                    default: break;
2759                    }
2760                  classbits[c] |= x;
2761                  }
2762    
2763    #ifdef SUPPORT_UTF8
2764                if (utf8)
2765                  {
2766                  class_utf8 = TRUE;
2767                  *class_utf8data++ = XCL_RANGE;
2768                  class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2769                  class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
2770                  *class_utf8data++ = XCL_RANGE;
2771                  class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
2772                  class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
2773                  *class_utf8data++ = XCL_RANGE;
2774                  class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
2775                  class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
2776                  *class_utf8data++ = XCL_RANGE;
2777                  class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
2778                  class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
2779                  *class_utf8data++ = XCL_RANGE;
2780                  class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
2781                  class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
2782                  *class_utf8data++ = XCL_RANGE;
2783                  class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
2784                  class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
2785                  *class_utf8data++ = XCL_RANGE;
2786                  class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
2787                  class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2788                  }
2789    #endif
2790                continue;
2791                }
2792    
2793              if (-c == ESC_v)
2794                {
2795                SETBIT(classbits, 0x0a); /* LF */
2796                SETBIT(classbits, 0x0b); /* VT */
2797                SETBIT(classbits, 0x0c); /* FF */
2798                SETBIT(classbits, 0x0d); /* CR */
2799                SETBIT(classbits, 0x85); /* NEL */
2800    #ifdef SUPPORT_UTF8
2801                if (utf8)
2802                  {
2803                  class_utf8 = TRUE;
2804                  *class_utf8data++ = XCL_RANGE;
2805                  class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
2806                  class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2807                  }
2808    #endif
2809                continue;
2810                }
2811    
2812              if (-c == ESC_V)
2813                {
2814                for (c = 0; c < 32; c++)
2815                  {
2816                  int x = 0xff;
2817                  switch (c)
2818                    {
2819                    case 0x0a/8: x ^= 1 << (0x0a%8);
2820                                 x ^= 1 << (0x0b%8);
2821                                 x ^= 1 << (0x0c%8);
2822                                 x ^= 1 << (0x0d%8);
2823                                 break;
2824                    case 0x85/8: x ^= 1 << (0x85%8); break;
2825                    default: break;
2826                    }
2827                  classbits[c] |= x;
2828                  }
2829    
2830    #ifdef SUPPORT_UTF8
2831                if (utf8)
2832                  {
2833                  class_utf8 = TRUE;
2834                  *class_utf8data++ = XCL_RANGE;
2835                  class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2836                  class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
2837                  *class_utf8data++ = XCL_RANGE;
2838                  class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2839                  class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2840                  }
2841    #endif
2842                continue;
2843                }
2844    
2845              /* We need to deal with \P and \p in both phases. */
2846    
2847    #ifdef SUPPORT_UCP
2848              if (-c == ESC_p || -c == ESC_P)
2849                {
2850                BOOL negated;
2851                int pdata;
2852                int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2853                if (ptype < 0) goto FAILED;
2854                class_utf8 = TRUE;
2855                *class_utf8data++ = ((-c == ESC_p) != negated)?
2856                  XCL_PROP : XCL_NOTPROP;
2857                *class_utf8data++ = ptype;
2858                *class_utf8data++ = pdata;
2859                class_charcount -= 2;   /* Not a < 256 character */
2860                continue;
2861                }
2862    #endif
2863              /* Unrecognized escapes are faulted if PCRE is running in its
2864              strict mode. By default, for compatibility with Perl, they are
2865              treated as literals. */
2866    
2867              if ((options & PCRE_EXTRA) != 0)
2868                {
2869                *errorcodeptr = ERR7;
2870                goto FAILED;
2871                }
2872    
2873              class_charcount -= 2;  /* Undo the default count from above */
2874              c = *ptr;              /* Get the final character and fall through */
2875              }
2876    
2877            /* Fall through if we have a single character (c >= 0). This may be
2878            greater than 256 in UTF-8 mode. */
2879    
2880            }   /* End of backslash handling */
2881    
2882          /* A single character may be followed by '-' to form a range. However,
2883          Perl does not permit ']' to be the end of the range. A '-' character
2884          at the end is treated as a literal. Perl ignores orphaned \E sequences
2885          entirely. The code for handling \Q and \E is messy. */
2886    
2887          CHECK_RANGE:
2888          while (ptr[1] == '\\' && ptr[2] == 'E')
2889            {
2890            inescq = FALSE;
2891            ptr += 2;
2892            }
2893    
2894          oldptr = ptr;
2895    
2896          if (!inescq && ptr[1] == '-')
2897            {
2898            int d;
2899            ptr += 2;
2900            while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
2901    
2902            /* If we hit \Q (not followed by \E) at this point, go into escaped
2903            mode. */
2904    
2905            while (*ptr == '\\' && ptr[1] == 'Q')
2906              {
2907              ptr += 2;
2908              if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
2909              inescq = TRUE;
2910              break;
2911              }
2912    
2913            if (*ptr == 0 || (!inescq && *ptr == ']'))
2914              {
2915              ptr = oldptr;
2916              goto LONE_SINGLE_CHARACTER;
2917              }
2918    
2919  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2920          if (utf8)          if (utf8)
# Line 2001  for (;; ptr++) Line 2929  for (;; ptr++)
2929          not any of the other escapes. Perl 5.6 treats a hyphen as a literal          not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2930          in such circumstances. */          in such circumstances. */
2931    
2932          if (d == '\\')          if (!inescq && d == '\\')
2933            {            {
2934            const uschar *oldptr = ptr;            d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2935            d = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);            if (*errorcodeptr != 0) goto FAILED;
2936    
2937            /* \b is backspace; \X is literal X; any other special means the '-'            /* \b is backspace; \X is literal X; \R is literal R; any other
2938            was literal */            special means the '-' was literal */
2939    
2940            if (d < 0)            if (d < 0)
2941              {              {
2942              if (d == -ESC_b) d = '\b';              if (d == -ESC_b) d = '\b';
2943              else if (d == -ESC_X) d = 'X'; else              else if (d == -ESC_X) d = 'X';
2944                else if (d == -ESC_R) d = 'R'; else
2945                {                {
2946                ptr = oldptr - 2;                ptr = oldptr;
2947                goto LONE_SINGLE_CHARACTER;  /* A few lines below */                goto LONE_SINGLE_CHARACTER;  /* A few lines below */
2948                }                }
2949              }              }
2950            }            }
2951    
2952          /* The check that the two values are in the correct order happens in          /* Check that the two values are in the correct order. Optimize
2953          the pre-pass. Optimize one-character ranges */          one-character ranges */
2954    
2955            if (d < c)
2956              {
2957              *errorcodeptr = ERR8;
2958              goto FAILED;
2959              }
2960    
2961          if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */          if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */
2962    
# Line 2042  for (;; ptr++) Line 2977  for (;; ptr++)
2977  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2978            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
2979              {              {
2980              int occ, ocd;              unsigned int occ, ocd;
2981              int cc = c;              unsigned int cc = c;
2982              int origd = d;              unsigned int origd = d;
2983              while (get_othercase_range(&cc, origd, &occ, &ocd))              while (get_othercase_range(&cc, origd, &occ, &ocd))
2984                {                {
2985                if (occ >= c && ocd <= d) continue;  /* Skip embedded ranges */                if (occ >= (unsigned int)c &&
2986                      ocd <= (unsigned int)d)
2987                    continue;                          /* Skip embedded ranges */
2988    
2989                if (occ < c  && ocd >= c - 1)        /* Extend the basic range */                if (occ < (unsigned int)c  &&
2990                      ocd >= (unsigned int)c - 1)      /* Extend the basic range */
2991                  {                                  /* if there is overlap,   */                  {                                  /* if there is overlap,   */
2992                  c = occ;                           /* noting that if occ < c */                  c = occ;                           /* noting that if occ < c */
2993                  continue;                          /* we can't have ocd > d  */                  continue;                          /* we can't have ocd > d  */
2994                  }                                  /* because a subrange is  */                  }                                  /* because a subrange is  */
2995                if (ocd > d && occ <= d + 1)         /* always shorter than    */                if (ocd > (unsigned int)d &&
2996                      occ <= (unsigned int)d + 1)      /* always shorter than    */
2997                  {                                  /* the basic range.       */                  {                                  /* the basic range.       */
2998                  d = ocd;                  d = ocd;
2999                  continue;                  continue;
# Line 2102  for (;; ptr++) Line 3041  for (;; ptr++)
3041          ranges that lie entirely within 0-127 when there is UCP support; else          ranges that lie entirely within 0-127 when there is UCP support; else
3042          for partial ranges without UCP support. */          for partial ranges without UCP support. */
3043    
3044          for (; c <= d; c++)          class_charcount += d - c + 1;
3045            class_lastchar = d;
3046    
3047            /* We can save a bit of time by skipping this in the pre-compile. */
3048    
3049            if (lengthptr == NULL) for (; c <= d; c++)
3050            {            {
3051            classbits[c/8] |= (1 << (c&7));            classbits[c/8] |= (1 << (c&7));
3052            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
# Line 2110  for (;; ptr++) Line 3054  for (;; ptr++)
3054              int uc = cd->fcc[c];           /* flip case */              int uc = cd->fcc[c];           /* flip case */
3055              classbits[uc/8] |= (1 << (uc&7));              classbits[uc/8] |= (1 << (uc&7));
3056              }              }
           class_charcount++;                /* in case a one-char range */  
           class_lastchar = c;  
3057            }            }
3058    
3059          continue;   /* Go get the next char in the class */          continue;   /* Go get the next char in the class */
# Line 2135  for (;; ptr++) Line 3077  for (;; ptr++)
3077  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
3078          if ((options & PCRE_CASELESS) != 0)          if ((options & PCRE_CASELESS) != 0)
3079            {            {
3080            int chartype;            unsigned int othercase;
3081            int othercase;            if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
           if (_pcre_ucp_findchar(c, &chartype, &othercase) >= 0 &&  
                othercase > 0)  
3082              {              {
3083              *class_utf8data++ = XCL_SINGLE;              *class_utf8data++ = XCL_SINGLE;
3084              class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);              class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
# Line 2163  for (;; ptr++) Line 3103  for (;; ptr++)
3103          }          }
3104        }        }
3105    
3106      /* Loop until ']' reached; the check for end of string happens inside the      /* Loop until ']' reached. This "while" is the end of the "do" above. */
     loop. This "while" is the end of the "do" above. */  
3107    
3108      while ((c = *(++ptr)) != ']' || inescq);      while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
3109    
3110        if (c == 0)                          /* Missing terminating ']' */
3111          {
3112          *errorcodeptr = ERR6;
3113          goto FAILED;
3114          }
3115    
3116      /* If class_charcount is 1, we saw precisely one character whose value is      /* If class_charcount is 1, we saw precisely one character whose value is
3117      less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we      less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
# Line 2230  for (;; ptr++) Line 3175  for (;; ptr++)
3175    
3176      /* If there are characters with values > 255, we have to compile an      /* If there are characters with values > 255, we have to compile an
3177      extended class, with its own opcode. If there are no characters < 256,      extended class, with its own opcode. If there are no characters < 256,
3178      we can omit the bitmap. */      we can omit the bitmap in the actual compiled code. */
3179    
3180  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
3181      if (class_utf8)      if (class_utf8)
# Line 2240  for (;; ptr++) Line 3185  for (;; ptr++)
3185        code += LINK_SIZE;        code += LINK_SIZE;
3186        *code = negate_class? XCL_NOT : 0;        *code = negate_class? XCL_NOT : 0;
3187    
3188        /* If the map is required, install it, and move on to the end of        /* If the map is required, move up the extra data to make room for it;
3189        the extra data */        otherwise just move the code pointer to the end of the extra data. */
3190    
3191        if (class_charcount > 0)        if (class_charcount > 0)
3192          {          {
3193          *code++ |= XCL_MAP;          *code++ |= XCL_MAP;
3194            memmove(code + 32, code, class_utf8data - code);
3195          memcpy(code, classbits, 32);          memcpy(code, classbits, 32);
3196          code = class_utf8data;          code = class_utf8data + 32;
         }  
   
       /* If the map is not required, slide down the extra data. */  
   
       else  
         {  
         int len = class_utf8data - (code + 33);  
         memmove(code + 1, code + 33, len);  
         code += len + 1;  
3197          }          }
3198          else code = class_utf8data;
3199    
3200        /* Now fill in the complete length of the item */        /* Now fill in the complete length of the item */
3201    
# Line 2274  for (;; ptr++) Line 3212  for (;; ptr++)
3212      if (negate_class)      if (negate_class)
3213        {        {
3214        *code++ = OP_NCLASS;        *code++ = OP_NCLASS;
3215        for (c = 0; c < 32; c++) code[c] = ~classbits[c];        if (lengthptr == NULL)    /* Save time in the pre-compile phase */
3216            for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3217        }        }
3218      else      else
3219        {        {
# Line 2284  for (;; ptr++) Line 3223  for (;; ptr++)
3223      code += 32;      code += 32;
3224      break;      break;
3225    
3226    
3227        /* ===================================================================*/
3228      /* Various kinds of repeat; '{' is not necessarily a quantifier, but this      /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3229      has been tested above. */      has been tested above. */
3230    
# Line 2351  for (;; ptr++) Line 3292  for (;; ptr++)
3292        }        }
3293      else repeat_type = greedy_default;      else repeat_type = greedy_default;
3294    
     /* If previous was a recursion, we need to wrap it inside brackets so that  
     it can be replicated if necessary. */  
   
     if (*previous == OP_RECURSE)  
       {  
       memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);  
       code += 1 + LINK_SIZE;  
       *previous = OP_BRA;  
       PUT(previous, 1, code - previous);  
       *code = OP_KET;  
       PUT(code, 1, code - previous);  
       code += 1 + LINK_SIZE;  
       }  
   
3295      /* If previous was a character match, abolish the item and generate a      /* If previous was a character match, abolish the item and generate a
3296      repeat item instead. If a char item has a minimum of more than one, ensure      repeat item instead. If a char item has a minimum of more than one, ensure
3297      that it is set in reqbyte - it might not be if a sequence such as x{3} is      that it is set in reqbyte - it might not be if a sequence such as x{3} is
# Line 2398  for (;; ptr++) Line 3325  for (;; ptr++)
3325          if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;          if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3326          }          }
3327    
3328          /* If the repetition is unlimited, it pays to see if the next thing on
3329          the line is something that cannot possibly match this character. If so,
3330          automatically possessifying this item gains some performance in the case
3331          where the match fails. */
3332    
3333          if (!possessive_quantifier &&
3334              repeat_max < 0 &&
3335              check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3336                options, cd))
3337            {
3338            repeat_type = 0;    /* Force greedy */
3339            possessive_quantifier = TRUE;
3340            }
3341    
3342        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
3343        }        }
3344    
3345      /* If previous was a single negated character ([^a] or similar), we use      /* If previous was a single negated character ([^a] or similar), we use
3346      one of the special opcodes, replacing it. The code is shared with single-      one of the special opcodes, replacing it. The code is shared with single-
3347      character repeats by setting opt_type to add a suitable offset into      character repeats by setting opt_type to add a suitable offset into
3348      repeat_type. OP_NOT is currently used only for single-byte chars. */      repeat_type. We can also test for auto-possessification. OP_NOT is
3349        currently used only for single-byte chars. */
3350    
3351      else if (*previous == OP_NOT)      else if (*previous == OP_NOT)
3352        {        {
3353        op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */        op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */
3354        c = previous[1];        c = previous[1];
3355          if (!possessive_quantifier &&
3356              repeat_max < 0 &&
3357              check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3358            {
3359            repeat_type = 0;    /* Force greedy */
3360            possessive_quantifier = TRUE;
3361            }
3362        goto OUTPUT_SINGLE_REPEAT;        goto OUTPUT_SINGLE_REPEAT;
3363        }        }
3364    
# Line 2423  for (;; ptr++) Line 3372  for (;; ptr++)
3372      else if (*previous < OP_EODN)      else if (*previous < OP_EODN)
3373        {        {
3374        uschar *oldcode;        uschar *oldcode;
3375        int prop_type;        int prop_type, prop_value;
3376        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
3377        c = *previous;        c = *previous;
3378    
3379          if (!possessive_quantifier &&
3380              repeat_max < 0 &&
3381              check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3382            {
3383            repeat_type = 0;    /* Force greedy */
3384            possessive_quantifier = TRUE;
3385            }
3386    
3387        OUTPUT_SINGLE_REPEAT:        OUTPUT_SINGLE_REPEAT:
3388        prop_type = (*previous == OP_PROP || *previous == OP_NOTPROP)?        if (*previous == OP_PROP || *previous == OP_NOTPROP)
3389          previous[1] : -1;          {
3390            prop_type = previous[1];
3391            prop_value = previous[2];
3392            }
3393          else prop_type = prop_value = -1;
3394    
3395        oldcode = code;        oldcode = code;
3396        code = previous;                  /* Usually overwrite previous item */        code = previous;                  /* Usually overwrite previous item */
# Line 2463  for (;; ptr++) Line 3424  for (;; ptr++)
3424          }          }
3425    
3426        /* A repeat minimum of 1 is optimized into some special cases. If the        /* A repeat minimum of 1 is optimized into some special cases. If the
3427        maximum is unlimited, we use OP_PLUS. Otherwise, the original item it        maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3428        left in place and, if the maximum is greater than 1, we use OP_UPTO with        left in place and, if the maximum is greater than 1, we use OP_UPTO with
3429        one less than the maximum. */        one less than the maximum. */
3430    
# Line 2490  for (;; ptr++) Line 3451  for (;; ptr++)
3451    
3452          /* If the maximum is unlimited, insert an OP_STAR. Before doing so,          /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3453          we have to insert the character for the previous code. For a repeated          we have to insert the character for the previous code. For a repeated
3454          Unicode property match, there is an extra byte that defines the          Unicode property match, there are two extra bytes that define the
3455          required property. In UTF-8 mode, long characters have their length in          required property. In UTF-8 mode, long characters have their length in
3456          c, with the 0x80 bit as a flag. */          c, with the 0x80 bit as a flag. */
3457    
# Line 2506  for (;; ptr++) Line 3467  for (;; ptr++)
3467  #endif  #endif
3468              {              {
3469              *code++ = c;              *code++ = c;
3470              if (prop_type >= 0) *code++ = prop_type;              if (prop_type >= 0)
3471                  {
3472                  *code++ = prop_type;
3473                  *code++ = prop_value;
3474                  }
3475              }              }
3476            *code++ = OP_STAR + repeat_type;            *code++ = OP_STAR + repeat_type;
3477            }            }
3478    
3479          /* Else insert an UPTO if the max is greater than the min, again          /* Else insert an UPTO if the max is greater than the min, again
3480          preceded by the character, for the previously inserted code. */          preceded by the character, for the previously inserted code. If the
3481            UPTO is just for 1 instance, we can use QUERY instead. */
3482    
3483          else if (repeat_max != repeat_min)          else if (repeat_max != repeat_min)
3484            {            {
# Line 2525  for (;; ptr++) Line 3491  for (;; ptr++)
3491            else            else
3492  #endif  #endif
3493            *code++ = c;            *code++ = c;
3494            if (prop_type >= 0) *code++ = prop_type;            if (prop_type >= 0)
3495                {
3496                *code++ = prop_type;
3497                *code++ = prop_value;
3498                }
3499            repeat_max -= repeat_min;            repeat_max -= repeat_min;
3500            *code++ = OP_UPTO + repeat_type;  
3501            PUT2INC(code, 0, repeat_max);            if (repeat_max == 1)
3502                {
3503                *code++ = OP_QUERY + repeat_type;
3504                }
3505              else
3506                {
3507                *code++ = OP_UPTO + repeat_type;
3508                PUT2INC(code, 0, repeat_max);
3509                }
3510            }            }
3511          }          }
3512    
# Line 2544  for (;; ptr++) Line 3522  for (;; ptr++)
3522  #endif  #endif
3523        *code++ = c;        *code++ = c;
3524    
3525        /* For a repeated Unicode property match, there is an extra byte that        /* For a repeated Unicode property match, there are two extra bytes that
3526        defines the required property. */        define the required property. */
3527    
3528  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
3529        if (prop_type >= 0) *code++ = prop_type;        if (prop_type >= 0)
3530            {
3531            *code++ = prop_type;
3532            *code++ = prop_value;
3533            }
3534  #endif  #endif
3535        }        }
3536    
# Line 2591  for (;; ptr++) Line 3573  for (;; ptr++)
3573      /* If previous was a bracket group, we may have to replicate it in certain      /* If previous was a bracket group, we may have to replicate it in certain
3574      cases. */      cases. */
3575    
3576      else if (*previous >= OP_BRA || *previous == OP_ONCE ||      else if (*previous == OP_BRA  || *previous == OP_CBRA ||
3577               *previous == OP_COND)               *previous == OP_ONCE || *previous == OP_COND)
3578        {        {
3579        register int i;        register int i;
3580        int ketoffset = 0;        int ketoffset = 0;
3581        int len = code - previous;        int len = code - previous;
3582        uschar *bralink = NULL;        uschar *bralink = NULL;
3583    
3584          /* Repeating a DEFINE group is pointless */
3585    
3586          if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3587            {
3588            *errorcodeptr = ERR55;
3589            goto FAILED;
3590            }
3591    
3592        /* If the maximum repeat count is unlimited, find the end of the bracket        /* If the maximum repeat count is unlimited, find the end of the bracket
3593        by scanning through from the start, and compute the offset back to it        by scanning through from the start, and compute the offset back to it
3594        from the current code pointer. There may be an OP_OPT setting following        from the current code pointer. There may be an OP_OPT setting following
# Line 2633  for (;; ptr++) Line 3623  for (;; ptr++)
3623          /* If the maximum is 1 or unlimited, we just have to stick in the          /* If the maximum is 1 or unlimited, we just have to stick in the
3624          BRAZERO and do no more at this point. However, we do need to adjust          BRAZERO and do no more at this point. However, we do need to adjust
3625          any OP_RECURSE calls inside the group that refer to the group itself or          any OP_RECURSE calls inside the group that refer to the group itself or
3626          any internal group, because the offset is from the start of the whole          any internal or forward referenced group, because the offset is from
3627          regex. Temporarily terminate the pattern while doing this. */          the start of the whole regex. Temporarily terminate the pattern while
3628            doing this. */
3629    
3630          if (repeat_max <= 1)          if (repeat_max <= 1)
3631            {            {
3632            *code = OP_END;            *code = OP_END;
3633            adjust_recurse(previous, 1, utf8, cd);            adjust_recurse(previous, 1, utf8, cd, save_hwm);
3634            memmove(previous+1, previous, len);            memmove(previous+1, previous, len);
3635            code++;            code++;
3636            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
# Line 2657  for (;; ptr++) Line 3648  for (;; ptr++)
3648            {            {
3649            int offset;            int offset;
3650            *code = OP_END;            *code = OP_END;
3651            adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd);            adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3652            memmove(previous + 2 + LINK_SIZE, previous, len);            memmove(previous + 2 + LINK_SIZE, previous, len);
3653            code += 2 + LINK_SIZE;            code += 2 + LINK_SIZE;
3654            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
# Line 2677  for (;; ptr++) Line 3668  for (;; ptr++)
3668        /* If the minimum is greater than zero, replicate the group as many        /* If the minimum is greater than zero, replicate the group as many
3669        times as necessary, and adjust the maximum to the number of subsequent        times as necessary, and adjust the maximum to the number of subsequent
3670        copies that we need. If we set a first char from the group, and didn't        copies that we need. If we set a first char from the group, and didn't
3671        set a required char, copy the latter from the former. */        set a required char, copy the latter from the former. If there are any
3672          forward reference subroutine calls in the group, there will be entries on
3673          the workspace list; replicate these with an appropriate increment. */
3674    
3675        else        else
3676          {          {
3677          if (repeat_min > 1)          if (repeat_min > 1)
3678            {            {
3679            if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;            /* In the pre-compile phase, we don't actually do the replication. We
3680            for (i = 1; i < repeat_min; i++)            just adjust the length as if we had. Do some paranoid checks for
3681              potential integer overflow. */
3682    
3683              if (lengthptr != NULL)
3684                {
3685                int delta = (repeat_min - 1)*length_prevgroup;
3686                if ((double)(repeat_min - 1)*(double)length_prevgroup >
3687                                                                (double)INT_MAX ||
3688                    OFLOW_MAX - *lengthptr < delta)
3689                  {
3690                  *errorcodeptr = ERR20;
3691                  goto FAILED;
3692                  }
3693                *lengthptr += delta;
3694                }
3695    
3696              /* This is compiling for real */
3697    
3698              else
3699              {              {
3700              memcpy(code, previous, len);              if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3701              code += len;              for (i = 1; i < repeat_min; i++)
3702                  {
3703                  uschar *hc;
3704                  uschar *this_hwm = cd->hwm;
3705                  memcpy(code, previous, len);
3706                  for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3707                    {
3708                    PUT(cd->hwm, 0, GET(hc, 0) + len);
3709                    cd->hwm += LINK_SIZE;
3710                    }
3711                  save_hwm = this_hwm;
3712                  code += len;
3713                  }
3714              }              }
3715            }            }
3716    
3717          if (repeat_max > 0) repeat_max -= repeat_min;          if (repeat_max > 0) repeat_max -= repeat_min;
3718          }          }
3719    
# Line 2697  for (;; ptr++) Line 3721  for (;; ptr++)
3721        the maximum is limited, it replicates the group in a nested fashion,        the maximum is limited, it replicates the group in a nested fashion,
3722        remembering the bracket starts on a stack. In the case of a zero minimum,        remembering the bracket starts on a stack. In the case of a zero minimum,
3723        the first one was set up above. In all cases the repeat_max now specifies        the first one was set up above. In all cases the repeat_max now specifies
3724        the number of additional copies needed. */        the number of additional copies needed. Again, we must remember to
3725          replicate entries on the forward reference list. */
3726    
3727        if (repeat_max >= 0)        if (repeat_max >= 0)
3728          {          {
3729          for (i = repeat_max - 1; i >= 0; i--)          /* In the pre-compile phase, we don't actually do the replication. We
3730            just adjust the length as if we had. For each repetition we must add 1
3731            to the length for BRAZERO and for all but the last repetition we must
3732            add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
3733            paranoid checks to avoid integer overflow. */
3734    
3735            if (lengthptr != NULL && repeat_max > 0)
3736              {
3737              int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3738                          2 - 2*LINK_SIZE;   /* Last one doesn't nest */
3739              if ((double)repeat_max *
3740                    (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
3741                      > (double)INT_MAX ||
3742                  OFLOW_MAX - *lengthptr < delta)
3743                {
3744                *errorcodeptr = ERR20;
3745                goto FAILED;
3746                }
3747              *lengthptr += delta;
3748              }
3749    
3750            /* This is compiling for real */
3751    
3752            else for (i = repeat_max - 1; i >= 0; i--)
3753            {            {
3754              uschar *hc;
3755              uschar *this_hwm = cd->hwm;
3756    
3757            *code++ = OP_BRAZERO + repeat_type;            *code++ = OP_BRAZERO + repeat_type;
3758    
3759            /* All but the final copy start a new nesting, maintaining the            /* All but the final copy start a new nesting, maintaining the
# Line 2718  for (;; ptr++) Line 3769  for (;; ptr++)
3769              }              }
3770    
3771            memcpy(code, previous, len);            memcpy(code, previous, len);
3772              for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3773                {
3774                PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3775                cd->hwm += LINK_SIZE;
3776                }
3777              save_hwm = this_hwm;
3778            code += len;            code += len;
3779            }            }
3780    
# Line 2740  for (;; ptr++) Line 3797  for (;; ptr++)
3797        /* If the maximum is unlimited, set a repeater in the final copy. We        /* If the maximum is unlimited, set a repeater in the final copy. We
3798        can't just offset backwards from the current code point, because we        can't just offset backwards from the current code point, because we
3799        don't know if there's been an options resetting after the ket. The        don't know if there's been an options resetting after the ket. The
3800        correct offset was computed above. */        correct offset was computed above.
3801    
3802          Then, when we are doing the actual compile phase, check to see whether
3803          this group is a non-atomic one that could match an empty string. If so,
3804          convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
3805          that runtime checking can be done. [This check is also applied to
3806          atomic groups at runtime, but in a different way.] */
3807    
3808        else code[-ketoffset] = OP_KETRMAX + repeat_type;        else
3809            {
3810            uschar *ketcode = code - ketoffset;
3811            uschar *bracode = ketcode - GET(ketcode, 1);
3812            *ketcode = OP_KETRMAX + repeat_type;
3813            if (lengthptr == NULL && *bracode != OP_ONCE)
3814              {
3815              uschar *scode = bracode;
3816              do
3817                {
3818                if (could_be_empty_branch(scode, ketcode, utf8))
3819                  {
3820                  *bracode += OP_SBRA - OP_BRA;
3821                  break;
3822                  }
3823                scode += GET(scode, 1);
3824                }
3825              while (*scode == OP_ALT);
3826              }
3827            }
3828        }        }
3829    
3830      /* Else there's some kind of shambles */      /* Else there's some kind of shambles */
# Line 2753  for (;; ptr++) Line 3835  for (;; ptr++)
3835        goto FAILED;        goto FAILED;
3836        }        }
3837    
3838      /* If the character following a repeat is '+', we wrap the entire repeated      /* If the character following a repeat is '+', or if certain optimization
3839      item inside OP_ONCE brackets. This is just syntactic sugar, taken from      tests above succeeded, possessive_quantifier is TRUE. For some of the
3840      Sun's Java package. The repeated item starts at tempcode, not at previous,      simpler opcodes, there is a special alternative opcode for this. For
3841      which might be the first part of a string whose (former) last char we      anything else, we wrap the entire repeated item inside OP_ONCE brackets.
3842      repeated. However, we don't support '+' after a greediness '?'. */      The '+' notation is just syntactic sugar, taken from Sun's Java package,
3843        but the special opcodes can optimize it a bit. The repeated item starts at
3844        tempcode, not at previous, which might be the first part of a string whose
3845        (former) last char we repeated.
3846    
3847        Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
3848        an 'upto' may follow. We skip over an 'exact' item, and then test the
3849        length of what remains before proceeding. */
3850    
3851      if (possessive_quantifier)      if (possessive_quantifier)
3852        {        {
3853        int len = code - tempcode;        int len;
3854        memmove(tempcode + 1+LINK_SIZE, tempcode, len);        if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
3855        code += 1 + LINK_SIZE;            *tempcode == OP_NOTEXACT)
3856        len += 1 + LINK_SIZE;          tempcode += _pcre_OP_lengths[*tempcode];
3857        tempcode[0] = OP_ONCE;        len = code - tempcode;
3858        *code++ = OP_KET;        if (len > 0) switch (*tempcode)
3859        PUTINC(code, 0, len);          {
3860        PUT(tempcode, 1, len);          case OP_STAR:  *tempcode = OP_POSSTAR; break;
3861            case OP_PLUS:  *tempcode = OP_POSPLUS; break;
3862            case OP_QUERY: *tempcode = OP_POSQUERY; break;
3863            case OP_UPTO:  *tempcode = OP_POSUPTO; break;
3864    
3865            case OP_TYPESTAR:  *tempcode = OP_TYPEPOSSTAR; break;
3866            case OP_TYPEPLUS:  *tempcode = OP_TYPEPOSPLUS; break;
3867            case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
3868            case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;
3869    
3870            case OP_NOTSTAR:  *tempcode = OP_NOTPOSSTAR; break;
3871            case OP_NOTPLUS:  *tempcode = OP_NOTPOSPLUS; break;
3872            case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
3873            case OP_NOTUPTO:  *tempcode = OP_NOTPOSUPTO; break;
3874    
3875            default:
3876            memmove(tempcode + 1+LINK_SIZE, tempcode, len);
3877            code += 1 + LINK_SIZE;
3878            len += 1 + LINK_SIZE;
3879            tempcode[0] = OP_ONCE;
3880            *code++ = OP_KET;
3881            PUTINC(code, 0, len);
3882            PUT(tempcode, 1, len);
3883            break;
3884            }
3885        }        }
3886    
3887      /* In all cases we no longer have a previous item. We also set the      /* In all cases we no longer have a previous item. We also set the
# Line 2781  for (;; ptr++) Line 3894  for (;; ptr++)
3894      break;      break;
3895    
3896    
3897      /* Start of nested bracket sub-expression, or comment or lookahead or      /* ===================================================================*/
3898      lookbehind or option setting or condition. First deal with special things      /* Start of nested parenthesized sub-expression, or comment or lookahead or
3899      that can come after a bracket; all are introduced by ?, and the appearance      lookbehind or option setting or condition or all the other extended
3900      of any of them means that this is not a referencing group. They were      parenthesis forms.  */
     checked for validity in the first pass over the string, so we don't have to  
     check for syntax errors here.  */  
3901    
3902      case '(':      case '(':
3903      newoptions = options;      newoptions = options;
3904      skipbytes = 0;      skipbytes = 0;
3905        bravalue = OP_CBRA;
3906        save_hwm = cd->hwm;
3907        reset_bracount = FALSE;
3908    
3909        /* First deal with various "verbs" that can be introduced by '*'. */
3910    
3911        if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
3912          {
3913          int i, namelen;
3914          const uschar *name = ++ptr;
3915          previous = NULL;
3916          while ((cd->ctypes[*++ptr] & ctype_letter) != 0);
3917          if (*ptr == ':')
3918            {
3919            *errorcodeptr = ERR59;   /* Not supported */
3920            goto FAILED;
3921            }
3922          if (*ptr != ')')
3923            {
3924            *errorcodeptr = ERR60;
3925            goto FAILED;
3926            }
3927          namelen = ptr - name;
3928          for (i = 0; i < verbcount; i++)
3929            {
3930            if (namelen == verbs[i].len &&
3931                strncmp((char *)name, verbs[i].name, namelen) == 0)
3932              {
3933              *code = verbs[i].op;
3934              if (*code++ == OP_ACCEPT) cd->had_accept = TRUE;
3935              break;
3936              }
3937            }
3938          if (i < verbcount) continue;
3939          *errorcodeptr = ERR60;
3940          goto FAILED;
3941          }
3942    
3943        /* Deal with the extended parentheses; all are introduced by '?', and the
3944        appearance of any of them means that this is not a capturing group. */
3945    
3946      if (*(++ptr) == '?')      else if (*ptr == '?')
3947        {        {
3948        int set, unset;        int i, set, unset, namelen;
3949        int *optset;        int *optset;
3950          const uschar *name;
3951          uschar *slot;
3952    
3953        switch (*(++ptr))        switch (*(++ptr))
3954          {          {
3955          case '#':                 /* Comment; skip to ket */          case '#':                 /* Comment; skip to ket */
3956          ptr++;          ptr++;
3957          while (*ptr != ')') ptr++;          while (*ptr != 0 && *ptr != ')') ptr++;
3958            if (*ptr == 0)
3959              {
3960              *errorcodeptr = ERR18;
3961              goto FAILED;
3962              }
3963          continue;          continue;
3964    
3965          case ':':                 /* Non-extracting bracket */  
3966            /* ------------------------------------------------------------ */
3967            case '|':                 /* Reset capture count for each branch */
3968            reset_bracount = TRUE;
3969            /* Fall through */
3970    
3971            /* ------------------------------------------------------------ */
3972            case ':':                 /* Non-capturing bracket */
3973          bravalue = OP_BRA;          bravalue = OP_BRA;
3974          ptr++;          ptr++;
3975          break;          break;
3976    
3977    
3978            /* ------------------------------------------------------------ */
3979          case '(':          case '(':
3980          bravalue = OP_COND;       /* Conditional group */          bravalue = OP_COND;       /* Conditional group */
3981    
3982          /* Condition to test for recursion */          /* A condition can be an assertion, a number (referring to a numbered
3983            group), a name (referring to a named group), or 'R', referring to
3984            recursion. R<digits> and R&name are also permitted for recursion tests.
3985    
3986            There are several syntaxes for testing a named group: (?(name)) is used
3987            by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
3988    
3989            There are two unfortunate ambiguities, caused by history. (a) 'R' can
3990            be the recursive thing or the name 'R' (and similarly for 'R' followed
3991            by digits), and (b) a number could be a name that consists of digits.
3992            In both cases, we look for a name first; if not found, we try the other
3993            cases. */
3994    
3995            /* For conditions that are assertions, check the syntax, and then exit
3996            the switch. This will take control down to where bracketed groups,
3997            including assertions, are processed. */
3998    
3999            if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
4000              break;
4001    
4002            /* Most other conditions use OP_CREF (a couple change to OP_RREF
4003            below), and all need to skip 3 bytes at the start of the group. */
4004    
4005          if (ptr[1] == 'R')          code[1+LINK_SIZE] = OP_CREF;
4006            skipbytes = 3;
4007            refsign = -1;
4008    
4009            /* Check for a test for recursion in a named group. */
4010    
4011            if (ptr[1] == 'R' && ptr[2] == '&')
4012            {            {
4013            code[1+LINK_SIZE] = OP_CREF;            terminator = -1;
4014            PUT2(code, 2+LINK_SIZE, CREF_RECURSE);            ptr += 2;
4015            skipbytes = 3;            code[1+LINK_SIZE] = OP_RREF;    /* Change the type of test */
           ptr += 3;  
4016            }            }
4017    
4018          /* Condition to test for a numbered subpattern match. We know that          /* Check for a test for a named group's having been set, using the Perl
4019          if a digit follows ( then there will just be digits until ) because          syntax (?(<name>) or (?('name') */
         the syntax was checked in the first pass. */  
4020    
4021          else if ((digitab[ptr[1]] && ctype_digit) != 0)          else if (ptr[1] == '<')
4022            {            {
4023            int condref;                 /* Don't amalgamate; some compilers */            terminator = '>';
           condref = *(++ptr) - '0';    /* grumble at autoincrement in declaration */  
           while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';  
           if (condref == 0)  
             {  
             *errorcodeptr = ERR35;  
             goto FAILED;  
             }  
4024            ptr++;            ptr++;
           code[1+LINK_SIZE] = OP_CREF;  
           PUT2(code, 2+LINK_SIZE, condref);  
           skipbytes = 3;  
4025            }            }
4026          /* For conditions that are assertions, we just fall through, having          else if (ptr[1] == '\'')
4027          set bravalue above. */            {
4028          break;            terminator = '\'';
4029              ptr++;
4030          case '=':                 /* Positive lookahead */            }
4031          bravalue = OP_ASSERT;          else
4032          ptr++;            {
4033          break;            terminator = 0;
4034              if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
4035              }
4036    
4037          case '!':                 /* Negative lookahead */          /* We now expect to read a name; anything else is an error */
         bravalue = OP_ASSERT_NOT;  
         ptr++;  
         break;  
4038    
4039          case '<':                 /* Lookbehinds */          if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
         switch (*(++ptr))  
4040            {            {
4041            case '=':               /* Positive lookbehind */            ptr += 1;  /* To get the right offset */
4042            bravalue = OP_ASSERTBACK;            *errorcodeptr = ERR28;
4043            ptr++;            goto FAILED;
4044            break;            }
4045    
4046            case '!':               /* Negative lookbehind */          /* Read the name, but also get it as a number if it's all digits */
4047            bravalue = OP_ASSERTBACK_NOT;  
4048            recno = 0;
4049            name = ++ptr;
4050            while ((cd->ctypes[*ptr] & ctype_word) != 0)
4051              {
4052              if (recno >= 0)
4053                recno = ((digitab[*ptr] & ctype_digit) != 0)?
4054                  recno * 10 + *ptr - '0' : -1;
4055            ptr++;            ptr++;
           break;  
4056            }            }
4057          break;          namelen = ptr - name;
4058    
4059          case '>':                 /* One-time brackets */          if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
4060          bravalue = OP_ONCE;            {
4061          ptr++;            ptr--;      /* Error offset */
4062          break;            *errorcodeptr = ERR26;
4063              goto FAILED;
4064              }
4065    
4066          case 'C':                 /* Callout - may be followed by digits; */          /* Do no further checking in the pre-compile phase. */
4067          previous_callout = code;  /* Save for later completion */  
4068          after_manual_callout = 1; /* Skip one item before completing */          if (lengthptr != NULL) break;
4069          *code++ = OP_CALLOUT;     /* Already checked that the terminating */  
4070            {                       /* closing parenthesis is present. */          /* In the real compile we do the work of looking for the actual
4071            int n = 0;          reference. If the string started with "+" or "-" we require the rest to
4072            while ((digitab[*(++ptr)] & ctype_digit) != 0)          be digits, in which case recno will be set. */
4073              n = n * 10 + *ptr - '0';  
4074            if (n > 255)          if (refsign > 0)
4075              {            {
4076              if (recno <= 0)
4077                {
4078                *errorcodeptr = ERR58;
4079                goto FAILED;
4080                }
4081              if (refsign == '-')
4082                {
4083                recno = cd->bracount - recno + 1;
4084                if (recno <= 0)
4085                  {
4086                  *errorcodeptr = ERR15;
4087                  goto FAILED;
4088                  }
4089                }
4090              else recno += cd->bracount;
4091              PUT2(code, 2+LINK_SIZE, recno);
4092              break;
4093              }
4094    
4095            /* Otherwise (did not start with "+" or "-"), start by looking for the
4096            name. */
4097    
4098            slot = cd->name_table;
4099            for (i = 0; i < cd->names_found; i++)
4100              {
4101              if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4102              slot += cd->name_entry_size;
4103              }
4104    
4105            /* Found a previous named subpattern */
4106    
4107            if (i < cd->names_found)
4108              {
4109              recno = GET2(slot, 0);
4110              PUT2(code, 2+LINK_SIZE, recno);
4111              }
4112    
4113            /* Search the pattern for a forward reference */
4114    
4115            else if ((i = find_parens(ptr, cd->bracount, name, namelen,
4116                            (options & PCRE_EXTENDED) != 0)) > 0)
4117              {
4118              PUT2(code, 2+LINK_SIZE, i);
4119              }
4120    
4121            /* If terminator == 0 it means that the name followed directly after
4122            the opening parenthesis [e.g. (?(abc)...] and in this case there are
4123            some further alternatives to try. For the cases where terminator != 0
4124            [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
4125            now checked all the possibilities, so give an error. */
4126    
4127            else if (terminator != 0)
4128              {
4129              *errorcodeptr = ERR15;
4130              goto FAILED;
4131              }
4132    
4133            /* Check for (?(R) for recursion. Allow digits after R to specify a
4134            specific group number. */
4135    
4136            else if (*name == 'R')
4137              {
4138              recno = 0;
4139              for (i = 1; i < namelen; i++)
4140                {
4141                if ((digitab[name[i]] & ctype_digit) == 0)
4142                  {
4143                  *errorcodeptr = ERR15;
4144                  goto FAILED;
4145                  }
4146                recno = recno * 10 + name[i] - '0';
4147                }
4148              if (recno == 0) recno = RREF_ANY;
4149              code[1+LINK_SIZE] = OP_RREF;      /* Change test type */
4150              PUT2(code, 2+LINK_SIZE, recno);
4151              }
4152    
4153            /* Similarly, check for the (?(DEFINE) "condition", which is always
4154            false. */
4155    
4156            else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
4157              {
4158              code[1+LINK_SIZE] = OP_DEF;
4159              skipbytes = 1;
4160              }
4161    
4162            /* Check for the "name" actually being a subpattern number. */
4163    
4164            else if (recno > 0)
4165              {
4166              PUT2(code, 2+LINK_SIZE, recno);
4167              }
4168    
4169            /* Either an unidentified subpattern, or a reference to (?(0) */
4170    
4171            else
4172              {
4173              *errorcodeptr = (recno == 0)? ERR35: ERR15;
4174              goto FAILED;
4175              }
4176            break;
4177    
4178    
4179            /* ------------------------------------------------------------ */
4180            case '=':                 /* Positive lookahead */
4181            bravalue = OP_ASSERT;
4182            ptr++;
4183            break;
4184    
4185    
4186            /* ------------------------------------------------------------ */
4187            case '!':                 /* Negative lookahead */
4188            ptr++;
4189            if (*ptr == ')')          /* Optimize (?!) */
4190              {
4191              *code++ = OP_FAIL;
4192              previous = NULL;
4193              continue;
4194              }
4195            bravalue = OP_ASSERT_NOT;
4196            break;
4197    
4198    
4199            /* ------------------------------------------------------------ */
4200            case '<':                 /* Lookbehind or named define */
4201            switch (ptr[1])
4202              {
4203              case '=':               /* Positive lookbehind */
4204              bravalue = OP_ASSERTBACK;
4205              ptr += 2;
4206              break;
4207    
4208              case '!':               /* Negative lookbehind */
4209              bravalue = OP_ASSERTBACK_NOT;
4210              ptr += 2;
4211              break;
4212    
4213              default:                /* Could be name define, else bad */
4214              if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
4215              ptr++;                  /* Correct offset for error */
4216              *errorcodeptr = ERR24;
4217              goto FAILED;
4218              }
4219            break;
4220    
4221    
4222            /* ------------------------------------------------------------ */
4223            case '>':                 /* One-time brackets */
4224            bravalue = OP_ONCE;
4225            ptr++;
4226            break;
4227    
4228    
4229            /* ------------------------------------------------------------ */
4230            case 'C':                 /* Callout - may be followed by digits; */
4231            previous_callout = code;  /* Save for later completion */
4232            after_manual_callout = 1; /* Skip one item before completing */
4233            *code++ = OP_CALLOUT;
4234              {
4235              int n = 0;
4236              while ((digitab[*(++ptr)] & ctype_digit) != 0)
4237                n = n * 10 + *ptr - '0';
4238              if (*ptr != ')')
4239                {
4240                *errorcodeptr = ERR39;
4241                goto FAILED;
4242                }
4243              if (n > 255)
4244                {
4245              *errorcodeptr = ERR38;              *errorcodeptr = ERR38;
4246              goto FAILED;              goto FAILED;
4247              }              }
# Line 2896  for (;; ptr++) Line 4253  for (;; ptr++)
4253          previous = NULL;          previous = NULL;
4254          continue;          continue;
4255    
4256          case 'P':                 /* Named subpattern handling */  
4257          if (*(++ptr) == '<')      /* Definition */          /* ------------------------------------------------------------ */
4258            case 'P':                 /* Python-style named subpattern handling */
4259            if (*(++ptr) == '=' || *ptr == '>')  /* Reference or recursion */
4260              {
4261              is_recurse = *ptr == '>';
4262              terminator = ')';
4263              goto NAMED_REF_OR_RECURSE;
4264              }
4265            else if (*ptr != '<')    /* Test for Python-style definition */
4266              {
4267              *errorcodeptr = ERR41;
4268              goto FAILED;
4269              }
4270            /* Fall through to handle (?P< as (?< is handled */
4271    
4272    
4273            /* ------------------------------------------------------------ */
4274            DEFINE_NAME:    /* Come here from (?< handling */
4275            case '\'':
4276            {            {
4277            int i, namelen;            terminator = (*ptr == '<')? '>' : '\'';
4278            uschar *slot = cd->name_table;            name = ++ptr;
4279            const uschar *name;     /* Don't amalgamate; some compilers */  
4280            name = ++ptr;           /* grumble at autoincrement in declaration */            while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4281              namelen = ptr - name;
4282    
4283            while (*ptr++ != '>');            /* In the pre-compile phase, just do a syntax check. */
           namelen = ptr - name - 1;  
4284    
4285            for (i = 0; i < cd->names_found; i++)            if (lengthptr != NULL)
4286              {              {
4287              int crc = memcmp(name, slot+2, namelen);              if (*ptr != terminator)
4288              if (crc == 0)                {
4289                  *errorcodeptr = ERR42;
4290                  goto FAILED;
4291                  }
4292                if (cd->names_found >= MAX_NAME_COUNT)
4293                  {
4294                  *errorcodeptr = ERR49;
4295                  goto FAILED;
4296                  }
4297                if (namelen + 3 > cd->name_entry_size)
4298                {                {
4299                if (slot[2+namelen] == 0)                cd->name_entry_size = namelen + 3;
4300                  if (namelen > MAX_NAME_SIZE)
4301                  {                  {
4302                  *errorcodeptr = ERR43;                  *errorcodeptr = ERR48;
4303                  goto FAILED;                  goto FAILED;
4304                  }                  }
               crc = -1;             /* Current name is substring */  
4305                }                }
4306              if (crc < 0)              }
4307    
4308              /* In the real compile, create the entry in the table */
4309    
4310              else
4311                {
4312                slot = cd->name_table;
4313                for (i = 0; i < cd->names_found; i++)
4314                {                {
4315                memmove(slot + cd->name_entry_size, slot,                int crc = memcmp(name, slot+2, namelen);
4316                  (cd->names_found - i) * cd->name_entry_size);                if (crc == 0)
4317                break;                  {
4318                    if (slot[2+namelen] == 0)
4319                      {
4320                      if ((options & PCRE_DUPNAMES) == 0)
4321                        {
4322                        *errorcodeptr = ERR43;
4323                        goto FAILED;
4324                        }
4325                      }
4326                    else crc = -1;      /* Current name is substring */
4327                    }
4328                  if (crc < 0)
4329                    {
4330                    memmove(slot + cd->name_entry_size, slot,
4331                      (cd->names_found - i) * cd->name_entry_size);
4332                    break;
4333                    }
4334                  slot += cd->name_entry_size;
4335                }                }
             slot += cd->name_entry_size;  
             }  
4336    
4337            PUT2(slot, 0, *brackets + 1);              PUT2(slot, 0, cd->bracount + 1);
4338            memcpy(slot + 2, name, namelen);              memcpy(slot + 2, name, namelen);
4339            slot[2+namelen] = 0;              slot[2+namelen] = 0;
4340            cd->names_found++;              }
           goto NUMBERED_GROUP;  
4341            }            }
4342    
4343          if (*ptr == '=' || *ptr == '>')  /* Reference or recursion */          /* In both cases, count the number of names we've encountered. */
4344    
4345            ptr++;                    /* Move past > or ' */
4346            cd->names_found++;
4347            goto NUMBERED_GROUP;
4348    
4349    
4350            /* ------------------------------------------------------------ */
4351            case '&':                 /* Perl recursion/subroutine syntax */
4352            terminator = ')';
4353            is_recurse = TRUE;
4354            /* Fall through */
4355    
4356            /* We come here from the Python syntax above that handles both
4357            references (?P=name) and recursion (?P>name), as well as falling
4358            through from the Perl recursion syntax (?&name). */
4359    
4360            NAMED_REF_OR_RECURSE:
4361            name = ++ptr;
4362            while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4363            namelen = ptr - name;
4364    
4365            /* In the pre-compile phase, do a syntax check and set a dummy
4366            reference number. */
4367    
4368            if (lengthptr != NULL)
4369            {            {
4370            int i, namelen;            if (*ptr != terminator)
4371            int type = *ptr++;              {
4372            const uschar *name = ptr;              *errorcodeptr = ERR42;
4373            uschar *slot = cd->name_table;              goto FAILED;
4374                }
4375              if (namelen > MAX_NAME_SIZE)
4376                {
4377                *errorcodeptr = ERR48;
4378                goto FAILED;
4379                }
4380              recno = 0;
4381              }
4382    
4383            while (*ptr != ')') ptr++;          /* In the real compile, seek the name in the table */
           namelen = ptr - name;  
4384    
4385            else
4386              {
4387              slot = cd->name_table;
4388            for (i = 0; i < cd->names_found; i++)            for (i = 0; i < cd->names_found; i++)
4389              {              {
4390              if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;              if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4391              slot += cd->name_entry_size;              slot += cd->name_entry_size;
4392              }              }
4393            if (i >= cd->names_found)  
4394              if (i < cd->names_found)         /* Back reference */
4395                {
4396                recno = GET2(slot, 0);
4397                }
4398              else if ((recno =                /* Forward back reference */
4399                        find_parens(ptr, cd->bracount, name, namelen,
4400                          (options & PCRE_EXTENDED) != 0)) <= 0)
4401              {              {
4402              *errorcodeptr = ERR15;              *errorcodeptr = ERR15;
4403              goto FAILED;              goto FAILED;
4404              }              }
4405              }
4406    
4407            recno = GET2(slot, 0);          /* In both phases, we can now go to the code that handles numerical
4408            recursion or backreferences. */
           if (type == '>') goto HANDLE_RECURSION;  /* A few lines below */  
   
           /* Back reference */  
4409    
4410            previous = code;          if (is_recurse) goto HANDLE_RECURSION;
4411            *code++ = OP_REF;            else goto HANDLE_REFERENCE;
           PUT2INC(code, 0, recno);  
           cd->backref_map |= (recno < 32)? (1 << recno) : 1;  
           if (recno > cd->top_backref) cd->top_backref = recno;  
           continue;  
           }  
4412    
         /* Should never happen */  
         break;  
4413    
4414          case 'R':                 /* Pattern recursion */          /* ------------------------------------------------------------ */
4415            case 'R':                 /* Recursion */
4416          ptr++;                    /* Same as (?0)      */          ptr++;                    /* Same as (?0)      */
4417          /* Fall through */          /* Fall through */
4418    
         /* Recursion or "subroutine" call */  
4419    
4420          case '0': case '1': case '2': case '3': case '4':          /* ------------------------------------------------------------ */
4421          case '5': case '6': case '7': case '8': case '9':          case '-': case '+':
4422            case '0': case '1': case '2': case '3': case '4':   /* Recursion or */
4423            case '5': case '6': case '7': case '8': case '9':   /* subroutine */
4424            {            {
4425            const uschar *called;            const uschar *called;
4426    
4427              if ((refsign = *ptr) == '+') ptr++;
4428              else if (refsign == '-')
4429                {
4430                if ((digitab[ptr[1]] & ctype_digit) == 0)
4431                  goto OTHER_CHAR_AFTER_QUERY;
4432                ptr++;
4433                }
4434    
4435            recno = 0;            recno = 0;
4436            while((digitab[*ptr] & ctype_digit) != 0)            while((digitab[*ptr] & ctype_digit) != 0)
4437              recno = recno * 10 + *ptr++ - '0';              recno = recno * 10 + *ptr++ - '0';
4438    
4439              if (*ptr != ')')
4440                {
4441                *errorcodeptr = ERR29;
4442                goto FAILED;
4443                }
4444    
4445              if (refsign == '-')
4446                {
4447                if (recno == 0)
4448                  {
4449                  *errorcodeptr = ERR58;
4450                  goto FAILED;
4451                  }
4452                recno = cd->bracount - recno + 1;
4453                if (recno <= 0)
4454                  {
4455                  *errorcodeptr = ERR15;
4456                  goto FAILED;
4457                  }
4458                }
4459              else if (refsign == '+')
4460                {
4461                if (recno == 0)
4462                  {
4463                  *errorcodeptr = ERR58;
4464                  goto FAILED;
4465                  }
4466                recno += cd->bracount;
4467                }
4468    
4469            /* Come here from code above that handles a named recursion */            /* Come here from code above that handles a named recursion */
4470    
4471            HANDLE_RECURSION:            HANDLE_RECURSION:
4472    
4473            previous = code;            previous = code;
4474              called = cd->start_code;
4475    
4476            /* Find the bracket that is being referenced. Temporarily end the            /* When we are actually compiling, find the bracket that is being
4477            regex in case it doesn't exist. */            referenced. Temporarily end the regex in case it doesn't exist before
4478              this point. If we end up with a forward reference, first check that
4479              the bracket does occur later so we can give the error (and position)
4480              now. Then remember this forward reference in the workspace so it can
4481              be filled in at the end. */
4482    
4483            *code = OP_END;            if (lengthptr == NULL)
           called = (recno == 0)?  
             cd->start_code : find_bracket(cd->start_code, utf8, recno);  
   
           if (called == NULL)  
4484              {              {
4485              *errorcodeptr = ERR15;              *code = OP_END;
4486              goto FAILED;              if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
             }  
4487    
4488            /* If the subpattern is still open, this is a recursive call. We              /* Forward reference */
           check to see if this is a left recursion that could loop for ever,  
           and diagnose that case. */  
4489    
4490            if (GET(called, 1) == 0 && could_be_empty(called, code, bcptr, utf8))              if (called == NULL)
4491              {                {
4492              *errorcodeptr = ERR40;                if (find_parens(ptr, cd->bracount, NULL, recno,
4493              goto FAILED;                     (options & PCRE_EXTENDED) != 0) < 0)
4494                    {
4495                    *errorcodeptr = ERR15;
4496                    goto FAILED;
4497                    }
4498                  called = cd->start_code + recno;
4499                  PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
4500                  }
4501    
4502                /* If not a forward reference, and the subpattern is still open,
4503                this is a recursive call. We check to see if this is a left
4504                recursion that could loop for ever, and diagnose that case. */
4505    
4506                else if (GET(called, 1) == 0 &&
4507                         could_be_empty(called, code, bcptr, utf8))
4508                  {
4509                  *errorcodeptr = ERR40;
4510                  goto FAILED;
4511                  }
4512              }              }
4513    
4514            /* Insert the recursion/subroutine item */            /* Insert the recursion/subroutine item, automatically wrapped inside
4515              "once" brackets. Set up a "previous group" length so that a
4516              subsequent quantifier will work. */
4517    
4518              *code = OP_ONCE;
4519              PUT(code, 1, 2 + 2*LINK_SIZE);
4520              code += 1 + LINK_SIZE;
4521    
4522            *code = OP_RECURSE;            *code = OP_RECURSE;
4523            PUT(code, 1, called - cd->start_code);            PUT(code, 1, called - cd->start_code);
4524            code += 1 + LINK_SIZE;            code += 1 + LINK_SIZE;
4525    
4526              *code = OP_KET;
4527              PUT(code, 1, 2 + 2*LINK_SIZE);
4528              code += 1 + LINK_SIZE;
4529    
4530              length_prevgroup = 3 + 3*LINK_SIZE;
4531            }            }
4532    
4533            /* Can't determine a first byte now */
4534    
4535            if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4536          continue;          continue;
4537    
         /* Character after (? not specially recognized */  
4538    
4539          default:                  /* Option setting */          /* ------------------------------------------------------------ */
4540            default:              /* Other characters: check option setting */
4541            OTHER_CHAR_AFTER_QUERY:
4542          set = unset = 0;          set = unset = 0;
4543          optset = &set;          optset = &set;
4544    
# Line 3036  for (;; ptr++) Line 4548  for (;; ptr++)
4548              {              {
4549              case '-': optset = &unset; break;              case '-': optset = &unset; break;
4550    
4551                case 'J':    /* Record that it changed in the external options */
4552                *optset |= PCRE_DUPNAMES;
4553                cd->external_options |= PCRE_JCHANGED;
4554                break;
4555    
4556              case 'i': *optset |= PCRE_CASELESS; break;              case 'i': *optset |= PCRE_CASELESS; break;
4557              case 'm': *optset |= PCRE_MULTILINE; break;              case 'm': *optset |= PCRE_MULTILINE; break;
4558              case 's': *optset |= PCRE_DOTALL; break;              case 's': *optset |= PCRE_DOTALL; break;
4559              case 'x': *optset |= PCRE_EXTENDED; break;              case 'x': *optset |= PCRE_EXTENDED; break;
4560              case 'U': *optset |= PCRE_UNGREEDY; break;              case 'U': *optset |= PCRE_UNGREEDY; break;
4561              case 'X': *optset |= PCRE_EXTRA; break;              case 'X': *optset |= PCRE_EXTRA; break;
4562    
4563                default:  *errorcodeptr = ERR12;
4564                          ptr--;    /* Correct the offset */
4565                          goto FAILED;
4566              }              }
4567            }            }
4568    
# Line 3050  for (;; ptr++) Line 4571  for (;; ptr++)
4571          newoptions = (options | set) & (~unset);          newoptions = (options | set) & (~unset);
4572    
4573          /* If the options ended with ')' this is not the start of a nested          /* If the options ended with ')' this is not the start of a nested
4574          group with option changes, so the options change at this level. Compile          group with option changes, so the options change at this level. If this
4575          code to change the ims options if this setting actually changes any of          item is right at the start of the pattern, the options can be
4576          them. We also pass the new setting back so that it can be put at the          abstracted and made external in the pre-compile phase, and ignored in
4577          start of any following branches, and when this group ends (if we are in          the compile phase. This can be helpful when matching -- for instance in
4578          a group), a resetting item can be compiled.          caseless checking of required bytes.
4579    
4580          Note that if this item is right at the start of the pattern, the          If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
4581          options will have been abstracted and made global, so there will be no          definitely *not* at the start of the pattern because something has been
4582          change to compile. */          compiled. In the pre-compile phase, however, the code pointer can have
4583            that value after the start, because it gets reset as code is discarded
4584            during the pre-compile. However, this can happen only at top level - if
4585            we are within parentheses, the starting BRA will still be present. At
4586            any parenthesis level, the length value can be used to test if anything
4587            has been compiled at that level. Thus, a test for both these conditions
4588            is necessary to ensure we correctly detect the start of the pattern in
4589            both phases.
4590    
4591            If we are not at the pattern start, compile code to change the ims
4592            options if this setting actually changes any of them. We also pass the
4593            new setting back so that it can be put at the start of any following
4594            branches, and when this group ends (if we are in a group), a resetting
4595            item can be compiled. */
4596    
4597          if (*ptr == ')')          if (*ptr == ')')
4598            {            {
4599            if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))            if (code == cd->start_code + 1 + LINK_SIZE &&
4600                   (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
4601              {              {
4602