/[pcre]/code/tags/pcre-7.8/pcre_compile.c
ViewVC logotype

Diff of /code/tags/pcre-7.8/pcre_compile.c

Parent Directory | Revision Log | View Patch

revision 79 by nigel, Sat Feb 24 21:40:52 2007 UTC | revision 218 by ph10, Thu Aug 16 10:13:23 2007 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2005 University of Cambridge             Copyright (c) 1997-2007 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 42  POSSIBILITY OF SUCH DAMAGE. Line 42  POSSIBILITY OF SUCH DAMAGE.
42  supporting internal functions that are not used by other modules. */  supporting internal functions that are not used by other modules. */
43    
44    
45    #ifdef HAVE_CONFIG_H
46    #include <config.h>
47    #endif
48    
49    #define NLBLOCK cd             /* Block containing newline information */
50    #define PSSTART start_pattern  /* Field containing processed string start */
51    #define PSEND   end_pattern    /* Field containing processed string end */
52    
53  #include "pcre_internal.h"  #include "pcre_internal.h"
54    
55    
56    /* When DEBUG is defined, we need the pcre_printint() function, which is also
57    used by pcretest. DEBUG is not defined when building a production library. */
58    
59    #ifdef DEBUG
60    #include "pcre_printint.src"
61    #endif
62    
63    
64    /* Macro for setting individual bits in class bitmaps. */
65    
66    #define SETBIT(a,b) a[b/8] |= (1 << (b%8))
67    
68    /* Maximum length value to check against when making sure that the integer that
69    holds the compiled pattern length does not overflow. We make it a bit less than
70    INT_MAX to allow for adding in group terminating bytes, so that we don't have
71    to check them every time. */
72    
73    #define OFLOW_MAX (INT_MAX - 20)
74    
75    
76  /*************************************************  /*************************************************
77  *      Code parameters and static tables         *  *      Code parameters and static tables         *
78  *************************************************/  *************************************************/
79    
80  /* Maximum number of items on the nested bracket stacks at compile time. This  /* This value specifies the size of stack workspace that is used during the
81  applies to the nesting of all kinds of parentheses. It does not limit  first pre-compile phase that determines how much memory is required. The regex
82  un-nested, non-capturing parentheses. This number can be made bigger if  is partly compiled into this space, but the compiled parts are discarded as
83  necessary - it is used to dimension one int and one unsigned char vector at  soon as they can be, so that hopefully there will never be an overrun. The code
84  compile time. */  does, however, check for an overrun. The largest amount I've seen used is 218,
85    so this number is very generous.
86    
87    The same workspace is used during the second, actual compile phase for
88    remembering forward references to groups so that they can be filled in at the
89    end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
90    is 4 there is plenty of room. */
91    
92  #define BRASTACK_SIZE 200  #define COMPILE_WORK_SIZE (4096)
93    
94    
95  /* Table for handling escaped characters in the range '0'-'z'. Positive returns  /* Table for handling escaped characters in the range '0'-'z'. Positive returns
# Line 63  are simple data values; negative values Line 97  are simple data values; negative values
97  on. Zero means further processing is needed (for things like \x), or the escape  on. Zero means further processing is needed (for things like \x), or the escape
98  is invalid. */  is invalid. */
99    
100  #if !EBCDIC   /* This is the "normal" table for ASCII systems */  #ifndef EBCDIC  /* This is the "normal" table for ASCII systems */
101  static const short int escapes[] = {  static const short int escapes[] = {
102       0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */       0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */
103       0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */       0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */
104     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */
105       0,      0,      0,      0,      0,      0,      0,      0,   /* H - O */  -ESC_H,      0,      0, -ESC_K,      0,      0,      0,      0,   /* H - O */
106  -ESC_P, -ESC_Q,      0, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */  -ESC_P, -ESC_Q, -ESC_R, -ESC_S,      0,      0, -ESC_V, -ESC_W,   /* P - W */
107  -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */  -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */
108     '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */     '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */
109       0,      0,      0,      0,      0,      0,  ESC_n,      0,   /* h - o */  -ESC_h,      0,      0, -ESC_k,      0,      0,  ESC_n,      0,   /* h - o */
110  -ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0,      0, -ESC_w,   /* p - w */  -ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0, -ESC_v, -ESC_w,   /* p - w */
111       0,      0, -ESC_z                                            /* x - z */       0,      0, -ESC_z                                            /* x - z */
112  };  };
113    
114  #else         /* This is the "abnormal" table for EBCDIC systems */  #else           /* This is the "abnormal" table for EBCDIC systems */
115  static const short int escapes[] = {  static const short int escapes[] = {
116  /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',  /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
117  /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,  /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
# Line 87  static const short int escapes[] = { Line 121  static const short int escapes[] = {
121  /*  70 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  70 */     0,     0,      0,       0,      0,     0,      0,      0,
122  /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',  /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
123  /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,  /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
124  /*  88 */     0,     0,      0,     '{',      0,     0,      0,      0,  /*  88 */-ESC_h,     0,      0,     '{',      0,     0,      0,      0,
125  /*  90 */     0,     0,      0,     'l',      0, ESC_n,      0, -ESC_p,  /*  90 */     0,     0, -ESC_k,     'l',      0, ESC_n,      0, -ESC_p,
126  /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,  /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
127  /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,     0, -ESC_w,      0,  /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,-ESC_v, -ESC_w,      0,
128  /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,  /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
129  /*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,
130  /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',  /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
131  /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,  /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
132  /*  C8 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,
133  /*  D0 */   '}',     0,      0,       0,      0,     0,      0, -ESC_P,  /*  D0 */   '}',     0, -ESC_K,       0,      0,     0,      0, -ESC_P,
134  /*  D8 */-ESC_Q,     0,      0,       0,      0,     0,      0,      0,  /*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
135  /*  E0 */  '\\',     0, -ESC_S,       0,      0,     0, -ESC_W, -ESC_X,  /*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,
136  /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,  /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
137  /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
138  /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0  /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0
# Line 106  static const short int escapes[] = { Line 140  static const short int escapes[] = {
140  #endif  #endif
141    
142    
143    /* Table of special "verbs" like (*PRUNE) */
144    
145    typedef struct verbitem {
146      const char *name;
147      int   len;
148      int   op;
149    } verbitem;
150    
151    static verbitem verbs[] = {
152      { "ACCEPT", 6, OP_ACCEPT },
153      { "COMMIT", 6, OP_COMMIT },
154      { "F",      1, OP_FAIL },
155      { "FAIL",   4, OP_FAIL },
156      { "PRUNE",  5, OP_PRUNE },
157      { "SKIP",   4, OP_SKIP  },
158      { "THEN",   4, OP_THEN  }
159    };
160    
161    static int verbcount = sizeof(verbs)/sizeof(verbitem);
162    
163    
164  /* Tables of names of POSIX character classes and their lengths. The list is  /* Tables of names of POSIX character classes and their lengths. The list is
165  terminated by a zero length entry. The first three must be alpha, upper, lower,  terminated by a zero length entry. The first three must be alpha, lower, upper,
166  as this is assumed for handling case independence. */  as this is assumed for handling case independence. */
167    
168  static const char *const posix_names[] = {  static const char *const posix_names[] = {
# Line 118  static const char *const posix_names[] = Line 173  static const char *const posix_names[] =
173  static const uschar posix_name_lengths[] = {  static const uschar posix_name_lengths[] = {
174    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
175    
176  /* Table of class bit maps for each POSIX class; up to three may be combined  /* Table of class bit maps for each POSIX class. Each class is formed from a
177  to form the class. The table for [:blank:] is dynamically modified to remove  base map, with an optional addition or removal of another map. Then, for some
178  the vertical space characters. */  classes, there is some additional tweaking: for [:blank:] the vertical space
179    characters are removed, and for [:alpha:] and [:alnum:] the underscore
180    character is removed. The triples in the table consist of the base map offset,
181    second map offset or -1 if no second map, and a non-negative value for map
182    addition or a negative value for map subtraction (if there are two maps). The
183    absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
184    remove vertical space characters, 2 => remove underscore. */
185    
186  static const int posix_class_maps[] = {  static const int posix_class_maps[] = {
187    cbit_lower, cbit_upper, -1,             /* alpha */    cbit_word,  cbit_digit, -2,             /* alpha */
188    cbit_lower, -1,         -1,             /* lower */    cbit_lower, -1,          0,             /* lower */
189    cbit_upper, -1,         -1,             /* upper */    cbit_upper, -1,          0,             /* upper */
190    cbit_digit, cbit_lower, cbit_upper,     /* alnum */    cbit_word,  -1,          2,             /* alnum - word without underscore */
191    cbit_print, cbit_cntrl, -1,             /* ascii */    cbit_print, cbit_cntrl,  0,             /* ascii */
192    cbit_space, -1,         -1,             /* blank - a GNU extension */    cbit_space, -1,          1,             /* blank - a GNU extension */
193    cbit_cntrl, -1,         -1,             /* cntrl */    cbit_cntrl, -1,          0,             /* cntrl */
194    cbit_digit, -1,         -1,             /* digit */    cbit_digit, -1,          0,             /* digit */
195    cbit_graph, -1,         -1,             /* graph */    cbit_graph, -1,          0,             /* graph */
196    cbit_print, -1,         -1,             /* print */    cbit_print, -1,          0,             /* print */
197    cbit_punct, -1,         -1,             /* punct */    cbit_punct, -1,          0,             /* punct */
198    cbit_space, -1,         -1,             /* space */    cbit_space, -1,          0,             /* space */
199    cbit_word,  -1,         -1,             /* word - a Perl extension */    cbit_word,  -1,          0,             /* word - a Perl extension */
200    cbit_xdigit,-1,         -1              /* xdigit */    cbit_xdigit,-1,          0              /* xdigit */
201  };  };
202    
203    
204    #define STRING(a)  # a
205    #define XSTRING(s) STRING(s)
206    
207  /* The texts of compile-time error messages. These are "char *" because they  /* The texts of compile-time error messages. These are "char *" because they
208  are passed to the outside world. */  are passed to the outside world. Do not ever re-use any error number, because
209    they are documented. Always add a new error instead. Messages marked DEAD below
210    are no longer used. */
211    
212  static const char *error_texts[] = {  static const char *error_texts[] = {
213    "no error",    "no error",
# Line 156  static const char *error_texts[] = { Line 222  static const char *error_texts[] = {
222    "range out of order in character class",    "range out of order in character class",
223    "nothing to repeat",    "nothing to repeat",
224    /* 10 */    /* 10 */
225    "operand of unlimited repeat could match the empty string",    "operand of unlimited repeat could match the empty string",  /** DEAD **/
226    "internal error: unexpected repeat",    "internal error: unexpected repeat",
227    "unrecognized character after (?",    "unrecognized character after (?",
228    "POSIX named classes are supported only within a class",    "POSIX named classes are supported only within a class",
# Line 166  static const char *error_texts[] = { Line 232  static const char *error_texts[] = {
232    "erroffset passed as NULL",    "erroffset passed as NULL",
233    "unknown option bit(s) set",    "unknown option bit(s) set",
234    "missing ) after comment",    "missing ) after comment",
235    "parentheses nested too deeply",    "parentheses nested too deeply",  /** DEAD **/
236    /* 20 */    /* 20 */
237    "regular expression too large",    "regular expression is too large",
238    "failed to get memory",    "failed to get memory",
239    "unmatched parentheses",    "unmatched parentheses",
240    "internal error: code overflow",    "internal error: code overflow",
241    "unrecognized character after (?<",    "unrecognized character after (?<",
242    /* 25 */    /* 25 */
243    "lookbehind assertion is not fixed length",    "lookbehind assertion is not fixed length",
244    "malformed number after (?(",    "malformed number or name after (?(",
245    "conditional group contains more than two branches",    "conditional group contains more than two branches",
246    "assertion expected after (?(",    "assertion expected after (?(",
247    "(?R or (?digits must be followed by )",    "(?R or (?[+-]digits must be followed by )",
248    /* 30 */    /* 30 */
249    "unknown POSIX class name",    "unknown POSIX class name",
250    "POSIX collating elements are not supported",    "POSIX collating elements are not supported",
251    "this version of PCRE is not compiled with PCRE_UTF8 support",    "this version of PCRE is not compiled with PCRE_UTF8 support",
252    "spare error",    "spare error",  /** DEAD **/
253    "character value in \\x{...} sequence is too large",    "character value in \\x{...} sequence is too large",
254    /* 35 */    /* 35 */
255    "invalid condition (?(0)",    "invalid condition (?(0)",
# Line 194  static const char *error_texts[] = { Line 260  static const char *error_texts[] = {
260    /* 40 */    /* 40 */
261    "recursive call could loop indefinitely",    "recursive call could loop indefinitely",
262    "unrecognized character after (?P",    "unrecognized character after (?P",
263    "syntax error after (?P",    "syntax error in subpattern name (missing terminator)",
264    "two named groups have the same name",    "two named subpatterns have the same name",
265    "invalid UTF-8 string",    "invalid UTF-8 string",
266    /* 45 */    /* 45 */
267    "support for \\P, \\p, and \\X has not been compiled",    "support for \\P, \\p, and \\X has not been compiled",
268    "malformed \\P or \\p sequence",    "malformed \\P or \\p sequence",
269    "unknown property name after \\P or \\p"    "unknown property name after \\P or \\p",
270      "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)",
271      "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")",
272      /* 50 */
273      "repeated subpattern is too long",    /** DEAD **/
274      "octal value is greater than \\377 (not in UTF-8 mode)",
275      "internal error: overran compiling workspace",
276      "internal error: previously-checked referenced subpattern not found",
277      "DEFINE group contains more than one branch",
278      /* 55 */
279      "repeating a DEFINE group is not allowed",
280      "inconsistent NEWLINE options",
281      "\\g is not followed by a braced name or an optionally braced non-zero number",
282      "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number",
283      "(*VERB) with an argument is not supported",
284      /* 60 */
285      "(*VERB) not recognized",
286      "number is too big"
287  };  };
288    
289    
# Line 220  For convenience, we use the same bit def Line 303  For convenience, we use the same bit def
303    
304  Then we can use ctype_digit and ctype_xdigit in the code. */  Then we can use ctype_digit and ctype_xdigit in the code. */
305    
306  #if !EBCDIC    /* This is the "normal" case, for ASCII systems */  #ifndef EBCDIC  /* This is the "normal" case, for ASCII systems */
307  static const unsigned char digitab[] =  static const unsigned char digitab[] =
308    {    {
309    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
# Line 256  static const unsigned char digitab[] = Line 339  static const unsigned char digitab[] =
339    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
340    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
341    
342  #else          /* This is the "abnormal" case, for EBCDIC systems */  #else           /* This is the "abnormal" case, for EBCDIC systems */
343  static const unsigned char digitab[] =  static const unsigned char digitab[] =
344    {    {
345    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
# Line 270  static const unsigned char digitab[] = Line 353  static const unsigned char digitab[] =
353    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
354    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
355    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
356    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88-     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- 95    */
357    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
358    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
359    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
# Line 304  static const unsigned char ebcdic_charta Line 387  static const unsigned char ebcdic_charta
387    0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */    0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */
388    0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */    0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
389    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
390    0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88-  */    0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- 95 */
391    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
392    0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */    0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
393    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
# Line 331  static const unsigned char ebcdic_charta Line 414  static const unsigned char ebcdic_charta
414  /* Definition to allow mutual recursion */  /* Definition to allow mutual recursion */
415    
416  static BOOL  static BOOL
417    compile_regex(int, int, int *, uschar **, const uschar **, int *, BOOL, int,    compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
418      int *, int *, branch_chain *, compile_data *);      int *, int *, branch_chain *, compile_data *, int *);
419    
420    
421    
# Line 342  static BOOL Line 425  static BOOL
425    
426  /* This function is called when a \ has been encountered. It either returns a  /* This function is called when a \ has been encountered. It either returns a
427  positive value for a simple escape such as \n, or a negative value which  positive value for a simple escape such as \n, or a negative value which
428  encodes one of the more complicated things such as \d. When UTF-8 is enabled,  encodes one of the more complicated things such as \d. A backreference to group
429  a positive value greater than 255 may be returned. On entry, ptr is pointing at  n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
430  the \. On exit, it is on the final character of the escape sequence.  UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
431    ptr is pointing at the \. On exit, it is on the final character of the escape
432    sequence.
433    
434  Arguments:  Arguments:
435    ptrptr         points to the pattern position pointer    ptrptr         points to the pattern position pointer
# Line 355  Arguments: Line 440  Arguments:
440    
441  Returns:         zero or positive => a data character  Returns:         zero or positive => a data character
442                   negative => a special escape sequence                   negative => a special escape sequence
443                   on error, errorptr is set                   on error, errorcodeptr is set
444  */  */
445    
446  static int  static int
447  check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,  check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
448    int options, BOOL isclass)    int options, BOOL isclass)
449  {  {
450  const uschar *ptr = *ptrptr;  BOOL utf8 = (options & PCRE_UTF8) != 0;
451    const uschar *ptr = *ptrptr + 1;
452  int c, i;  int c, i;
453    
454    GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */
455    ptr--;                            /* Set pointer back to the last byte */
456    
457  /* If backslash is at the end of the pattern, it's an error. */  /* If backslash is at the end of the pattern, it's an error. */
458    
 c = *(++ptr);  
459  if (c == 0) *errorcodeptr = ERR1;  if (c == 0) *errorcodeptr = ERR1;
460    
461  /* Non-alphamerics are literals. For digits or letters, do an initial lookup in  /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
462  a table. A non-zero result is something that can be returned immediately.  a table. A non-zero result is something that can be returned immediately.
463  Otherwise further processing may be required. */  Otherwise further processing may be required. */
464    
465  #if !EBCDIC    /* ASCII coding */  #ifndef EBCDIC  /* ASCII coding */
466  else if (c < '0' || c > 'z') {}                           /* Not alphameric */  else if (c < '0' || c > 'z') {}                           /* Not alphameric */
467  else if ((i = escapes[c - '0']) != 0) c = i;  else if ((i = escapes[c - '0']) != 0) c = i;
468    
469  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
470  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphameric */  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphameric */
471  else if ((i = escapes[c - 0x48]) != 0)  c = i;  else if ((i = escapes[c - 0x48]) != 0)  c = i;
472  #endif  #endif
# Line 388  else if ((i = escapes[c - 0x48]) != 0) Line 476  else if ((i = escapes[c - 0x48]) != 0)
476  else  else
477    {    {
478    const uschar *oldptr;    const uschar *oldptr;
479      BOOL braced, negated;
480    
481    switch (c)    switch (c)
482      {      {
483      /* A number of Perl escapes are not handled by PCRE. We give an explicit      /* A number of Perl escapes are not handled by PCRE. We give an explicit
# Line 401  else Line 491  else
491      *errorcodeptr = ERR37;      *errorcodeptr = ERR37;
492      break;      break;
493    
494        /* \g must be followed by a number, either plain or braced. If positive, it
495        is an absolute backreference. If negative, it is a relative backreference.
496        This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a
497        reference to a named group. This is part of Perl's movement towards a
498        unified syntax for back references. As this is synonymous with \k{name}, we
499        fudge it up by pretending it really was \k. */
500    
501        case 'g':
502        if (ptr[1] == '{')
503          {
504          const uschar *p;
505          for (p = ptr+2; *p != 0 && *p != '}'; p++)
506            if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
507          if (*p != 0 && *p != '}')
508            {
509            c = -ESC_k;
510            break;
511            }
512          braced = TRUE;
513          ptr++;
514          }
515        else braced = FALSE;
516    
517        if (ptr[1] == '-')
518          {
519          negated = TRUE;
520          ptr++;
521          }
522        else negated = FALSE;
523    
524        c = 0;
525        while ((digitab[ptr[1]] & ctype_digit) != 0)
526          c = c * 10 + *(++ptr) - '0';
527    
528        if (c < 0)
529          {
530          *errorcodeptr = ERR61;
531          break;
532          }
533    
534        if (c == 0 || (braced && *(++ptr) != '}'))
535          {
536          *errorcodeptr = ERR57;
537          break;
538          }
539    
540        if (negated)
541          {
542          if (c > bracount)
543            {
544            *errorcodeptr = ERR15;
545            break;
546            }
547          c = bracount - (c - 1);
548          }
549    
550        c = -(ESC_REF + c);
551        break;
552    
553      /* The handling of escape sequences consisting of a string of digits      /* The handling of escape sequences consisting of a string of digits
554      starting with one that is not zero is not straightforward. By experiment,      starting with one that is not zero is not straightforward. By experiment,
555      the way Perl works seems to be as follows:      the way Perl works seems to be as follows:
# Line 422  else Line 571  else
571        c -= '0';        c -= '0';
572        while ((digitab[ptr[1]] & ctype_digit) != 0)        while ((digitab[ptr[1]] & ctype_digit) != 0)
573          c = c * 10 + *(++ptr) - '0';          c = c * 10 + *(++ptr) - '0';
574          if (c < 0)
575            {
576            *errorcodeptr = ERR61;
577            break;
578            }
579        if (c < 10 || c <= bracount)        if (c < 10 || c <= bracount)
580          {          {
581          c = -(ESC_REF + c);          c = -(ESC_REF + c);
# Line 442  else Line 596  else
596        }        }
597    
598      /* \0 always starts an octal number, but we may drop through to here with a      /* \0 always starts an octal number, but we may drop through to here with a
599      larger first octal digit. */      larger first octal digit. The original code used just to take the least
600        significant 8 bits of octal numbers (I think this is what early Perls used
601        to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
602        than 3 octal digits. */
603    
604      case '0':      case '0':
605      c -= '0';      c -= '0';
606      while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')      while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
607          c = c * 8 + *(++ptr) - '0';          c = c * 8 + *(++ptr) - '0';
608      c &= 255;     /* Take least significant 8 bits */      if (!utf8 && c > 255) *errorcodeptr = ERR51;
609      break;      break;
610    
611      /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number      /* \x is complicated. \x{ddd} is a character number which can be greater
612      which can be greater than 0xff, but only if the ddd are hex digits. */      than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
613        treated as a data character. */
614    
615      case 'x':      case 'x':
616  #ifdef SUPPORT_UTF8      if (ptr[1] == '{')
     if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)  
617        {        {
618        const uschar *pt = ptr + 2;        const uschar *pt = ptr + 2;
619        register int count = 0;        int count = 0;
620    
621        c = 0;        c = 0;
622        while ((digitab[*pt] & ctype_xdigit) != 0)        while ((digitab[*pt] & ctype_xdigit) != 0)
623          {          {
624          int cc = *pt++;          register int cc = *pt++;
625            if (c == 0 && cc == '0') continue;     /* Leading zeroes */
626          count++;          count++;
627  #if !EBCDIC    /* ASCII coding */  
628    #ifndef EBCDIC  /* ASCII coding */
629          if (cc >= 'a') cc -= 32;               /* Convert to upper case */          if (cc >= 'a') cc -= 32;               /* Convert to upper case */
630          c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));          c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
631  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
632          if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */          if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */
633          c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));          c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
634  #endif  #endif
635          }          }
636    
637        if (*pt == '}')        if (*pt == '}')
638          {          {
639          if (c < 0 || count > 8) *errorcodeptr = ERR34;          if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
640          ptr = pt;          ptr = pt;
641          break;          break;
642          }          }
643    
644        /* If the sequence of hex digits does not end with '}', then we don't        /* If the sequence of hex digits does not end with '}', then we don't
645        recognize this construct; fall through to the normal \x handling. */        recognize this construct; fall through to the normal \x handling. */
646        }        }
 #endif  
647    
648      /* Read just a single hex char */      /* Read just a single-byte hex-defined char */
649    
650      c = 0;      c = 0;
651      while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)      while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
652        {        {
653        int cc;                               /* Some compilers don't like ++ */        int cc;                               /* Some compilers don't like ++ */
654        cc = *(++ptr);                        /* in initializers */        cc = *(++ptr);                        /* in initializers */
655  #if !EBCDIC    /* ASCII coding */  #ifndef EBCDIC  /* ASCII coding */
656        if (cc >= 'a') cc -= 32;              /* Convert to upper case */        if (cc >= 'a') cc -= 32;              /* Convert to upper case */
657        c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));        c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
658  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
659        if (cc <= 'z') cc += 64;              /* Convert to upper case */        if (cc <= 'z') cc += 64;              /* Convert to upper case */
660        c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));        c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
661  #endif  #endif
662        }        }
663      break;      break;
664    
665      /* Other special escapes not starting with a digit are straightforward */      /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
666        This coding is ASCII-specific, but then the whole concept of \cx is
667        ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
668    
669      case 'c':      case 'c':
670      c = *(++ptr);      c = *(++ptr);
671      if (c == 0)      if (c == 0)
672        {        {
673        *errorcodeptr = ERR2;        *errorcodeptr = ERR2;
674        return 0;        break;
675        }        }
676    
677      /* A letter is upper-cased; then the 0x40 bit is flipped. This coding  #ifndef EBCDIC  /* ASCII coding */
     is ASCII-specific, but then the whole concept of \cx is ASCII-specific.  
     (However, an EBCDIC equivalent has now been added.) */  
   
 #if !EBCDIC    /* ASCII coding */  
678      if (c >= 'a' && c <= 'z') c -= 32;      if (c >= 'a' && c <= 'z') c -= 32;
679      c ^= 0x40;      c ^= 0x40;
680  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
681      if (c >= 'a' && c <= 'z') c += 64;      if (c >= 'a' && c <= 'z') c += 64;
682      c ^= 0xC0;      c ^= 0xC0;
683  #endif  #endif
# Line 560  escape sequence. Line 719  escape sequence.
719  Argument:  Argument:
720    ptrptr         points to the pattern position pointer    ptrptr         points to the pattern position pointer
721    negptr         points to a boolean that is set TRUE for negation else FALSE    negptr         points to a boolean that is set TRUE for negation else FALSE
722      dptr           points to an int that is set to the detailed property value
723    errorcodeptr   points to the error code variable    errorcodeptr   points to the error code variable
724    
725  Returns:     value from ucp_type_table, or -1 for an invalid type  Returns:         type value from ucp_type_table, or -1 for an invalid type
726  */  */
727    
728  static int  static int
729  get_ucp(const uschar **ptrptr, BOOL *negptr, int *errorcodeptr)  get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
730  {  {
731  int c, i, bot, top;  int c, i, bot, top;
732  const uschar *ptr = *ptrptr;  const uschar *ptr = *ptrptr;
733  char name[4];  char name[32];
734    
735  c = *(++ptr);  c = *(++ptr);
736  if (c == 0) goto ERROR_RETURN;  if (c == 0) goto ERROR_RETURN;
737    
738  *negptr = FALSE;  *negptr = FALSE;
739    
740  /* \P or \p can be followed by a one- or two-character name in {}, optionally  /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
741  preceded by ^ for negation. */  negation. */
742    
743  if (c == '{')  if (c == '{')
744    {    {
# Line 587  if (c == '{') Line 747  if (c == '{')
747      *negptr = TRUE;      *negptr = TRUE;
748      ptr++;      ptr++;
749      }      }
750    for (i = 0; i <= 2; i++)    for (i = 0; i < (int)sizeof(name) - 1; i++)
751      {      {
752      c = *(++ptr);      c = *(++ptr);
753      if (c == 0) goto ERROR_RETURN;      if (c == 0) goto ERROR_RETURN;
754      if (c == '}') break;      if (c == '}') break;
755      name[i] = c;      name[i] = c;
756      }      }
757    if (c !='}')   /* Try to distinguish error cases */    if (c !='}') goto ERROR_RETURN;
     {  
     while (*(++ptr) != 0 && *ptr != '}');  
     if (*ptr == '}') goto UNKNOWN_RETURN; else goto ERROR_RETURN;  
     }  
758    name[i] = 0;    name[i] = 0;
759    }    }
760    
# Line 619  top = _pcre_utt_size; Line 775  top = _pcre_utt_size;
775    
776  while (bot < top)  while (bot < top)
777    {    {
778    i = (bot + top)/2;    i = (bot + top) >> 1;
779    c = strcmp(name, _pcre_utt[i].name);    c = strcmp(name, _pcre_utt[i].name);
780    if (c == 0) return _pcre_utt[i].value;    if (c == 0)
781        {
782        *dptr = _pcre_utt[i].value;
783        return _pcre_utt[i].type;
784        }
785    if (c > 0) bot = i + 1; else top = i;    if (c > 0) bot = i + 1; else top = i;
786    }    }
787    
 UNKNOWN_RETURN:  
788  *errorcodeptr = ERR47;  *errorcodeptr = ERR47;
789  *ptrptr = ptr;  *ptrptr = ptr;
790  return -1;  return -1;
# Line 698  read_repeat_counts(const uschar *p, int Line 857  read_repeat_counts(const uschar *p, int
857  int min = 0;  int min = 0;
858  int max = -1;  int max = -1;
859    
860    /* Read the minimum value and do a paranoid check: a negative value indicates
861    an integer overflow. */
862    
863  while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';  while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
864    if (min < 0 || min > 65535)
865      {
866      *errorcodeptr = ERR5;
867      return p;
868      }
869    
870    /* Read the maximum value if there is one, and again do a paranoid check on its size.
871    Also, max must not be less than min. */
872    
873  if (*p == '}') max = min; else  if (*p == '}') max = min; else
874    {    {
# Line 706  if (*p == '}') max = min; else Line 876  if (*p == '}') max = min; else
876      {      {
877      max = 0;      max = 0;
878      while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';      while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
879        if (max < 0 || max > 65535)
880          {
881          *errorcodeptr = ERR5;
882          return p;
883          }
884      if (max < min)      if (max < min)
885        {        {
886        *errorcodeptr = ERR4;        *errorcodeptr = ERR4;
# Line 714  if (*p == '}') max = min; else Line 889  if (*p == '}') max = min; else
889      }      }
890    }    }
891    
892  /* Do paranoid checks, then fill in the required variables, and pass back the  /* Fill in the required variables, and pass back the pointer to the terminating
893  pointer to the terminating '}'. */  '}'. */
894    
895  if (min > 65535 || max > 65535)  *minp = min;
896    *errorcodeptr = ERR5;  *maxp = max;
897  else  return p;
898    }
899    
900    
901    
902    /*************************************************
903    *       Find forward referenced subpattern       *
904    *************************************************/
905    
906    /* This function scans along a pattern's text looking for capturing
907    subpatterns, and counting them. If it finds a named pattern that matches the
908    name it is given, it returns its number. Alternatively, if the name is NULL, it
909    returns when it reaches a given numbered subpattern. This is used for forward
910    references to subpatterns. We know that if (?P< is encountered, the name will
911    be terminated by '>' because that is checked in the first pass.
912    
913    Arguments:
914      ptr          current position in the pattern
915      count        current count of capturing parens so far encountered
916      name         name to seek, or NULL if seeking a numbered subpattern
917      lorn         name length, or subpattern number if name is NULL
918      xmode        TRUE if we are in /x mode
919    
920    Returns:       the number of the named subpattern, or -1 if not found
921    */
922    
923    static int
924    find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
925      BOOL xmode)
926    {
927    const uschar *thisname;
928    
929    for (; *ptr != 0; ptr++)
930    {    {
931    *minp = min;    int term;
932    *maxp = max;  
933      /* Skip over backslashed characters and also entire \Q...\E */
934    
935      if (*ptr == '\\')
936        {
937        if (*(++ptr) == 0) return -1;
938        if (*ptr == 'Q') for (;;)
939          {
940          while (*(++ptr) != 0 && *ptr != '\\');
941          if (*ptr == 0) return -1;
942          if (*(++ptr) == 'E') break;
943          }
944        continue;
945        }
946    
947      /* Skip over character classes */
948    
949      if (*ptr == '[')
950        {
951        while (*(++ptr) != ']')
952          {
953          if (*ptr == 0) return -1;
954          if (*ptr == '\\')
955            {
956            if (*(++ptr) == 0) return -1;
957            if (*ptr == 'Q') for (;;)
958              {
959              while (*(++ptr) != 0 && *ptr != '\\');
960              if (*ptr == 0) return -1;
961              if (*(++ptr) == 'E') break;
962              }
963            continue;
964            }
965          }
966        continue;
967        }
968    
969      /* Skip comments in /x mode */
970    
971      if (xmode && *ptr == '#')
972        {
973        while (*(++ptr) != 0 && *ptr != '\n');
974        if (*ptr == 0) return -1;
975        continue;
976        }
977    
978      /* An opening parens must now be a real metacharacter */
979    
980      if (*ptr != '(') continue;
981      if (ptr[1] != '?' && ptr[1] != '*')
982        {
983        count++;
984        if (name == NULL && count == lorn) return count;
985        continue;
986        }
987    
988      ptr += 2;
989      if (*ptr == 'P') ptr++;                      /* Allow optional P */
990    
991      /* We have to disambiguate (?<! and (?<= from (?<name> */
992    
993      if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
994           *ptr != '\'')
995        continue;
996    
997      count++;
998    
999      if (name == NULL && count == lorn) return count;
1000      term = *ptr++;
1001      if (term == '<') term = '>';
1002      thisname = ptr;
1003      while (*ptr != term) ptr++;
1004      if (name != NULL && lorn == ptr - thisname &&
1005          strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1006        return count;
1007    }    }
1008  return p;  
1009    return -1;
1010  }  }
1011    
1012    
# Line 778  for (;;) Line 1060  for (;;)
1060    
1061      case OP_CALLOUT:      case OP_CALLOUT:
1062      case OP_CREF:      case OP_CREF:
1063      case OP_BRANUMBER:      case OP_RREF:
1064        case OP_DEF:
1065      code += _pcre_OP_lengths[*code];      code += _pcre_OP_lengths[*code];
1066      break;      break;
1067    
# Line 823  for (;;) Line 1106  for (;;)
1106    {    {
1107    int d;    int d;
1108    register int op = *cc;    register int op = *cc;
   if (op >= OP_BRA) op = OP_BRA;  
   
1109    switch (op)    switch (op)
1110      {      {
1111        case OP_CBRA:
1112      case OP_BRA:      case OP_BRA:
1113      case OP_ONCE:      case OP_ONCE:
1114      case OP_COND:      case OP_COND:
1115      d = find_fixedlength(cc, options);      d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1116      if (d < 0) return d;      if (d < 0) return d;
1117      branchlength += d;      branchlength += d;
1118      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
# Line 865  for (;;) Line 1147  for (;;)
1147      /* Skip over things that don't match chars */      /* Skip over things that don't match chars */
1148    
1149      case OP_REVERSE:      case OP_REVERSE:
     case OP_BRANUMBER:  
1150      case OP_CREF:      case OP_CREF:
1151        case OP_RREF:
1152        case OP_DEF:
1153      case OP_OPT:      case OP_OPT:
1154      case OP_CALLOUT:      case OP_CALLOUT:
1155      case OP_SOD:      case OP_SOD:
# Line 884  for (;;) Line 1167  for (;;)
1167    
1168      case OP_CHAR:      case OP_CHAR:
1169      case OP_CHARNC:      case OP_CHARNC:
1170        case OP_NOT:
1171      branchlength++;      branchlength++;
1172      cc += 2;      cc += 2;
1173  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
# Line 910  for (;;) Line 1194  for (;;)
1194    
1195      case OP_TYPEEXACT:      case OP_TYPEEXACT:
1196      branchlength += GET2(cc,1);      branchlength += GET2(cc,1);
1197        if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1198      cc += 4;      cc += 4;
1199      break;      break;
1200    
# Line 917  for (;;) Line 1202  for (;;)
1202    
1203      case OP_PROP:      case OP_PROP:
1204      case OP_NOTPROP:      case OP_NOTPROP:
1205      cc++;      cc += 2;
1206      /* Fall through */      /* Fall through */
1207    
1208      case OP_NOT_DIGIT:      case OP_NOT_DIGIT:
# Line 998  Returns: pointer to the opcode for Line 1283  Returns: pointer to the opcode for
1283  static const uschar *  static const uschar *
1284  find_bracket(const uschar *code, BOOL utf8, int number)  find_bracket(const uschar *code, BOOL utf8, int number)
1285  {  {
 #ifndef SUPPORT_UTF8  
 utf8 = utf8;               /* Stop pedantic compilers complaining */  
 #endif  
   
1286  for (;;)  for (;;)
1287    {    {
1288    register int c = *code;    register int c = *code;
1289    if (c == OP_END) return NULL;    if (c == OP_END) return NULL;
1290    else if (c > OP_BRA)  
1291      /* XCLASS is used for classes that cannot be represented just by a bit
1292      map. This includes negated single high-valued characters. The length in
1293      the table is zero; the actual length is stored in the compiled code. */
1294    
1295      if (c == OP_XCLASS) code += GET(code, 1);
1296    
1297      /* Handle capturing bracket */
1298    
1299      else if (c == OP_CBRA)
1300      {      {
1301      int n = c - OP_BRA;      int n = GET2(code, 1+LINK_SIZE);
     if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE);  
1302      if (n == number) return (uschar *)code;      if (n == number) return (uschar *)code;
1303      code += _pcre_OP_lengths[OP_BRA];      code += _pcre_OP_lengths[c];
1304      }      }
1305    
1306      /* Otherwise, we can get the item's length from the table, except that for
1307      repeated character types, we have to test for \p and \P, which have an extra
1308      two bytes of parameters. */
1309    
1310    else    else
1311      {      {
1312        switch(c)
1313          {
1314          case OP_TYPESTAR:
1315          case OP_TYPEMINSTAR:
1316          case OP_TYPEPLUS:
1317          case OP_TYPEMINPLUS:
1318          case OP_TYPEQUERY:
1319          case OP_TYPEMINQUERY:
1320          case OP_TYPEUPTO:
1321          case OP_TYPEMINUPTO:
1322          case OP_TYPEEXACT:
1323          case OP_TYPEPOSSTAR:
1324          case OP_TYPEPOSPLUS:
1325          case OP_TYPEPOSQUERY:
1326          case OP_TYPEPOSUPTO:
1327          if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1328          break;
1329          }
1330    
1331        /* Add in the fixed length from the table */
1332    
1333      code += _pcre_OP_lengths[c];      code += _pcre_OP_lengths[c];
1334    
1335      /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1336      a multi-byte character. The length in the table is a minimum, so we have to
1337      arrange to skip the extra bytes. */
1338    
1339  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
   
     /* In UTF-8 mode, opcodes that are followed by a character may be followed  
     by a multi-byte character. The length in the table is a minimum, so we have  
     to scan along to skip the extra bytes. All opcodes are less than 128, so we  
     can use relatively efficient code. */  
   
1340      if (utf8) switch(c)      if (utf8) switch(c)
1341        {        {
1342        case OP_CHAR:        case OP_CHAR:
# Line 1031  for (;;) Line 1344  for (;;)
1344        case OP_EXACT:        case OP_EXACT:
1345        case OP_UPTO:        case OP_UPTO:
1346        case OP_MINUPTO:        case OP_MINUPTO:
1347          case OP_POSUPTO:
1348        case OP_STAR:        case OP_STAR:
1349        case OP_MINSTAR:        case OP_MINSTAR:
1350          case OP_POSSTAR:
1351        case OP_PLUS:        case OP_PLUS:
1352        case OP_MINPLUS:        case OP_MINPLUS:
1353          case OP_POSPLUS:
1354        case OP_QUERY:        case OP_QUERY:
1355        case OP_MINQUERY:        case OP_MINQUERY:
1356        while ((*code & 0xc0) == 0x80) code++;        case OP_POSQUERY:
1357        break;        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
   
       /* XCLASS is used for classes that cannot be represented just by a bit  
       map. This includes negated single high-valued characters. The length in  
       the table is zero; the actual length is stored in the compiled code. */  
   
       case OP_XCLASS:  
       code += GET(code, 1) + 1;  
1358        break;        break;
1359        }        }
1360  #endif  #endif
# Line 1072  Returns: pointer to the opcode for Line 1381  Returns: pointer to the opcode for
1381  static const uschar *  static const uschar *
1382  find_recurse(const uschar *code, BOOL utf8)  find_recurse(const uschar *code, BOOL utf8)
1383  {  {
 #ifndef SUPPORT_UTF8  
 utf8 = utf8;               /* Stop pedantic compilers complaining */  
 #endif  
   
1384  for (;;)  for (;;)
1385    {    {
1386    register int c = *code;    register int c = *code;
1387    if (c == OP_END) return NULL;    if (c == OP_END) return NULL;
1388    else if (c == OP_RECURSE) return code;    if (c == OP_RECURSE) return code;
1389    else if (c > OP_BRA)  
1390      {    /* XCLASS is used for classes that cannot be represented just by a bit
1391      code += _pcre_OP_lengths[OP_BRA];    map. This includes negated single high-valued characters. The length in
1392      }    the table is zero; the actual length is stored in the compiled code. */
1393    
1394      if (c == OP_XCLASS) code += GET(code, 1);
1395    
1396      /* Otherwise, we can get the item's length from the table, except that for
1397      repeated character types, we have to test for \p and \P, which have an extra
1398      two bytes of parameters. */
1399    
1400    else    else
1401      {      {
1402      code += _pcre_OP_lengths[c];      switch(c)
1403          {
1404  #ifdef SUPPORT_UTF8        case OP_TYPESTAR:
1405          case OP_TYPEMINSTAR:
1406          case OP_TYPEPLUS:
1407          case OP_TYPEMINPLUS:
1408          case OP_TYPEQUERY:
1409          case OP_TYPEMINQUERY:
1410          case OP_TYPEUPTO:
1411          case OP_TYPEMINUPTO:
1412          case OP_TYPEEXACT:
1413          case OP_TYPEPOSSTAR:
1414          case OP_TYPEPOSPLUS:
1415          case OP_TYPEPOSQUERY:
1416          case OP_TYPEPOSUPTO:
1417          if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1418          break;
1419          }
1420    
1421        /* Add in the fixed length from the table */
1422    
1423        code += _pcre_OP_lengths[c];
1424    
1425      /* In UTF-8 mode, opcodes that are followed by a character may be followed      /* In UTF-8 mode, opcodes that are followed by a character may be followed
1426      by a multi-byte character. The length in the table is a minimum, so we have      by a multi-byte character. The length in the table is a minimum, so we have
1427      to scan along to skip the extra bytes. All opcodes are less than 128, so we      to arrange to skip the extra bytes. */
1428      can use relatively efficient code. */  
1429    #ifdef SUPPORT_UTF8
1430      if (utf8) switch(c)      if (utf8) switch(c)
1431        {        {
1432        case OP_CHAR:        case OP_CHAR:
# Line 1103  for (;;) Line 1434  for (;;)
1434        case OP_EXACT:        case OP_EXACT:
1435        case OP_UPTO:        case OP_UPTO:
1436        case OP_MINUPTO:        case OP_MINUPTO:
1437          case OP_POSUPTO:
1438        case OP_STAR:        case OP_STAR:
1439        case OP_MINSTAR:        case OP_MINSTAR:
1440          case OP_POSSTAR:
1441        case OP_PLUS:        case OP_PLUS:
1442        case OP_MINPLUS:        case OP_MINPLUS:
1443          case OP_POSPLUS:
1444        case OP_QUERY:        case OP_QUERY:
1445        case OP_MINQUERY:        case OP_MINQUERY:
1446        while ((*code & 0xc0) == 0x80) code++;        case OP_POSQUERY:
1447        break;        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
   
       /* XCLASS is used for classes that cannot be represented just by a bit  
       map. This includes negated single high-valued characters. The length in  
       the table is zero; the actual length is stored in the compiled code. */  
   
       case OP_XCLASS:  
       code += GET(code, 1) + 1;  
1448        break;        break;
1449        }        }
1450  #endif  #endif
# Line 1132  for (;;) Line 1459  for (;;)
1459  *************************************************/  *************************************************/
1460    
1461  /* This function scans through a branch of a compiled pattern to see whether it  /* This function scans through a branch of a compiled pattern to see whether it
1462  can match the empty string or not. It is called only from could_be_empty()  can match the empty string or not. It is called from could_be_empty()
1463  below. Note that first_significant_code() skips over assertions. If we hit an  below and from compile_branch() when checking for an unlimited repeat of a
1464  unclosed bracket, we return "empty" - this means we've struck an inner bracket  group that can match nothing. Note that first_significant_code() skips over
1465  whose current branch will already have been scanned.  assertions. If we hit an unclosed bracket, we return "empty" - this means we've
1466    struck an inner bracket whose current branch will already have been scanned.
1467    
1468  Arguments:  Arguments:
1469    code        points to start of search    code        points to start of search
# Line 1149  static BOOL Line 1477  static BOOL
1477  could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)  could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1478  {  {
1479  register int c;  register int c;
1480  for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0, TRUE);  for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1481       code < endcode;       code < endcode;
1482       code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))       code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1483    {    {
# Line 1157  for (code = first_significant_code(code Line 1485  for (code = first_significant_code(code
1485    
1486    c = *code;    c = *code;
1487    
1488    if (c >= OP_BRA)    /* Groups with zero repeats can of course be empty; skip them. */
1489    
1490      if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1491        {
1492        code += _pcre_OP_lengths[c];
1493        do code += GET(code, 1); while (*code == OP_ALT);
1494        c = *code;
1495        continue;
1496        }
1497    
1498      /* For other groups, scan the branches. */
1499    
1500      if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1501      {      {
1502      BOOL empty_branch;      BOOL empty_branch;
1503      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
# Line 1173  for (code = first_significant_code(code Line 1513  for (code = first_significant_code(code
1513        }        }
1514      while (*code == OP_ALT);      while (*code == OP_ALT);
1515      if (!empty_branch) return FALSE;   /* All branches are non-empty */      if (!empty_branch) return FALSE;   /* All branches are non-empty */
     code += 1 + LINK_SIZE;  
1516      c = *code;      c = *code;
1517        continue;
1518      }      }
1519    
1520    else switch (c)    /* Handle the other opcodes */
1521    
1522      switch (c)
1523      {      {
1524      /* Check for quantifiers after a class */      /* Check for quantifiers after a class. XCLASS is used for classes that
1525        cannot be represented just by a bit map. This includes negated single
1526        high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1527        actual length is stored in the compiled code, so we must update "code"
1528        here. */
1529    
1530  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1531      case OP_XCLASS:      case OP_XCLASS:
1532      ccode = code + GET(code, 1);      ccode = code += GET(code, 1);
1533      goto CHECK_CLASS_REPEAT;      goto CHECK_CLASS_REPEAT;
1534  #endif  #endif
1535    
# Line 1233  for (code = first_significant_code(code Line 1579  for (code = first_significant_code(code
1579      case OP_NOT:      case OP_NOT:
1580      case OP_PLUS:      case OP_PLUS:
1581      case OP_MINPLUS:      case OP_MINPLUS:
1582        case OP_POSPLUS:
1583      case OP_EXACT:      case OP_EXACT:
1584      case OP_NOTPLUS:      case OP_NOTPLUS:
1585      case OP_NOTMINPLUS:      case OP_NOTMINPLUS:
1586        case OP_NOTPOSPLUS:
1587      case OP_NOTEXACT:      case OP_NOTEXACT:
1588      case OP_TYPEPLUS:      case OP_TYPEPLUS:
1589      case OP_TYPEMINPLUS:      case OP_TYPEMINPLUS:
1590        case OP_TYPEPOSPLUS:
1591      case OP_TYPEEXACT:      case OP_TYPEEXACT:
1592      return FALSE;      return FALSE;
1593    
# Line 1250  for (code = first_significant_code(code Line 1599  for (code = first_significant_code(code
1599      case OP_ALT:      case OP_ALT:
1600      return TRUE;      return TRUE;
1601    
1602      /* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO  may be      /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1603      followed by a multibyte character */      MINUPTO, and POSUPTO may be followed by a multibyte character */
1604    
1605  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1606      case OP_STAR:      case OP_STAR:
1607      case OP_MINSTAR:      case OP_MINSTAR:
1608        case OP_POSSTAR:
1609      case OP_QUERY:      case OP_QUERY:
1610      case OP_MINQUERY:      case OP_MINQUERY:
1611        case OP_POSQUERY:
1612      case OP_UPTO:      case OP_UPTO:
1613      case OP_MINUPTO:      case OP_MINUPTO:
1614        case OP_POSUPTO:
1615      if (utf8) while ((code[2] & 0xc0) == 0x80) code++;      if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1616      break;      break;
1617  #endif  #endif
# Line 1377  earlier groups that are outside the curr Line 1729  earlier groups that are outside the curr
1729  optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before  optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1730  it, after it has been compiled. This means that any OP_RECURSE items within it  it, after it has been compiled. This means that any OP_RECURSE items within it
1731  that refer to the group itself or any contained groups have to have their  that refer to the group itself or any contained groups have to have their
1732  offsets adjusted. That is the job of this function. Before it is called, the  offsets adjusted. That is one of the jobs of this function. Before it is called,
1733  partially compiled regex must be temporarily terminated with OP_END.  the partially compiled regex must be temporarily terminated with OP_END.
1734    
1735    This function has been extended with the possibility of forward references for
1736    recursions and subroutine calls. It must also check the list of such references
1737    for the group we are dealing with. If it finds that one of the recursions in
1738    the current group is on this list, it adjusts the offset in the list, not the
1739    value in the reference (which is a group number).
1740    
1741  Arguments:  Arguments:
1742    group      points to the start of the group    group      points to the start of the group
1743    adjust     the amount by which the group is to be moved    adjust     the amount by which the group is to be moved
1744    utf8       TRUE in UTF-8 mode    utf8       TRUE in UTF-8 mode
1745    cd         contains pointers to tables etc.    cd         contains pointers to tables etc.
1746      save_hwm   the hwm forward reference pointer at the start of the group
1747    
1748  Returns:     nothing  Returns:     nothing
1749  */  */
1750    
1751  static void  static void
1752  adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd)  adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1753      uschar *save_hwm)
1754  {  {
1755  uschar *ptr = group;  uschar *ptr = group;
1756  while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)  while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1757    {    {
1758    int offset = GET(ptr, 1);    int offset;
1759    if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);    uschar *hc;
1760    
1761      /* See if this recursion is on the forward reference list. If so, adjust the
1762      reference. */
1763    
1764      for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1765        {
1766        offset = GET(hc, 0);
1767        if (cd->start_code + offset == ptr + 1)
1768          {
1769          PUT(hc, 0, offset + adjust);
1770          break;
1771          }
1772        }
1773    
1774      /* Otherwise, adjust the recursion offset if it's after the start of this
1775      group. */
1776    
1777      if (hc >= cd->hwm)
1778        {
1779        offset = GET(ptr, 1);
1780        if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1781        }
1782    
1783    ptr += 1 + LINK_SIZE;    ptr += 1 + LINK_SIZE;
1784    }    }
1785  }  }
# Line 1475  Yield: TRUE when range returned; Line 1858  Yield: TRUE when range returned;
1858  */  */
1859    
1860  static BOOL  static BOOL
1861  get_othercase_range(int *cptr, int d, int *ocptr, int *odptr)  get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
1862      unsigned int *odptr)
1863  {  {
1864  int c, chartype, othercase, next;  unsigned int c, othercase, next;
1865    
1866  for (c = *cptr; c <= d; c++)  for (c = *cptr; c <= d; c++)
1867    {    { if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) break; }
   if (_pcre_ucp_findchar(c, &chartype, &othercase) == ucp_L && othercase != 0)  
     break;  
   }  
1868    
1869  if (c > d) return FALSE;  if (c > d) return FALSE;
1870    
# Line 1492  next = othercase + 1; Line 1873  next = othercase + 1;
1873    
1874  for (++c; c <= d; c++)  for (++c; c <= d; c++)
1875    {    {
1876    if (_pcre_ucp_findchar(c, &chartype, &othercase) != ucp_L ||    if (_pcre_ucp_othercase(c) != next) break;
         othercase != next)  
     break;  
1877    next++;    next++;
1878    }    }
1879    
# Line 1506  return TRUE; Line 1885  return TRUE;
1885  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
1886    
1887    
1888    
1889  /*************************************************  /*************************************************
1890  *           Compile one branch                   *  *     Check if auto-possessifying is possible    *
1891  *************************************************/  *************************************************/
1892    
1893  /* Scan the pattern, compiling it into the code vector. If the options are  /* This function is called for unlimited repeats of certain items, to see
1894  changed during the branch, the pointer is used to change the external options  whether the next thing could possibly match the repeated item. If not, it makes
1895  bits.  sense to automatically possessify the repeated item.
1896    
1897  Arguments:  Arguments:
1898    optionsptr     pointer to the option bits    op_code       the repeated op code
1899    brackets       points to number of extracting brackets used    item          data for this item, depends on the opcode
1900    codeptr        points to the pointer to the current code point    utf8          TRUE in UTF-8 mode
1901    ptrptr         points to the current pattern pointer    utf8_char     used for utf8 character bytes, NULL if not relevant
1902    errorcodeptr   points to error code variable    ptr           next character in pattern
1903    firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)    options       options bits
1904    reqbyteptr     set to the last literal character required, else < 0    cd            contains pointers to tables etc.
   bcptr          points to current branch chain  
   cd             contains pointers to tables etc.  
1905    
1906  Returns:         TRUE on success  Returns:        TRUE if possessifying is wanted
                  FALSE, with *errorcodeptr set non-zero on error  
1907  */  */
1908    
1909  static BOOL  static BOOL
1910  compile_branch(int *optionsptr, int *brackets, uschar **codeptr,  check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
1911    const uschar **ptrptr, int *errorcodeptr, int *firstbyteptr,    const uschar *ptr, int options, compile_data *cd)
   int *reqbyteptr, branch_chain *bcptr, compile_data *cd)  
1912  {  {
1913  int repeat_type, op_type;  int next;
1914  int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */  
1915  int bravalue = 0;  /* Skip whitespace and comments in extended mode */
1916  int greedy_default, greedy_non_default;  
1917  int firstbyte, reqbyte;  if ((options & PCRE_EXTENDED) != 0)
1918  int zeroreqbyte, zerofirstbyte;    {
1919  int req_caseopt, reqvary, tempreqvary;    for (;;)
1920  int condcount = 0;      {
1921  int options = *optionsptr;      while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1922  int after_manual_callout = 0;      if (*ptr == '#')
1923  register int c;        {
1924  register uschar *code = *codeptr;        while (*(++ptr) != 0)
1925  uschar *tempcode;          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1926  BOOL inescq = FALSE;        }
1927  BOOL groupsetfirstbyte = FALSE;      else break;
1928  const uschar *ptr = *ptrptr;      }
1929  const uschar *tempptr;    }
1930  uschar *previous = NULL;  
1931  uschar *previous_callout = NULL;  /* If the next item is one that we can handle, get its value. A non-negative
1932  uschar classbits[32];  value is a character, a negative value is an escape value. */
1933    
1934    if (*ptr == '\\')
1935      {
1936      int temperrorcode = 0;
1937      next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
1938      if (temperrorcode != 0) return FALSE;
1939      ptr++;    /* Point after the escape sequence */
1940      }
1941    
1942    else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
1943      {
1944  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1945  BOOL class_utf8;    if (utf8) { GETCHARINC(next, ptr); } else
 BOOL utf8 = (options & PCRE_UTF8) != 0;  
 uschar *class_utf8data;  
 uschar utf8_char[6];  
 #else  
 BOOL utf8 = FALSE;  
1946  #endif  #endif
1947      next = *ptr++;
1948      }
1949    
1950  /* Set up the default and non-default settings for greediness */  else return FALSE;
   
 greedy_default = ((options & PCRE_UNGREEDY) != 0);  
 greedy_non_default = greedy_default ^ 1;  
1951    
1952  /* Initialize no first byte, no required byte. REQ_UNSET means "no char  /* Skip whitespace and comments in extended mode */
1953  matching encountered yet". It gets changed to REQ_NONE if we hit something that  
1954  matches a non-fixed char first char; reqbyte just remains unset if we never  if ((options & PCRE_EXTENDED) != 0)
1955  find one.    {
1956      for (;;)
1957        {
1958        while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1959        if (*ptr == '#')
1960          {
1961          while (*(++ptr) != 0)
1962            if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1963          }
1964        else break;
1965        }
1966      }
1967    
1968    /* If the next thing is itself optional, we have to give up. */
1969    
1970    if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
1971      return FALSE;
1972    
1973    /* Now compare the next item with the previous opcode. If the previous is a
1974    positive single character match, "item" either contains the character or, if
1975    "item" is greater than 127 in utf8 mode, the character's bytes are in
1976    utf8_char. */
1977    
1978    
1979    /* Handle cases when the next item is a character. */
1980    
1981    if (next >= 0) switch(op_code)
1982      {
1983      case OP_CHAR:
1984    #ifdef SUPPORT_UTF8
1985      if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1986    #endif
1987      return item != next;
1988    
1989      /* For CHARNC (caseless character) we must check the other case. If we have
1990      Unicode property support, we can use it to test the other case of
1991      high-valued characters. */
1992    
1993      case OP_CHARNC:
1994    #ifdef SUPPORT_UTF8
1995      if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1996    #endif
1997      if (item == next) return FALSE;
1998    #ifdef SUPPORT_UTF8
1999      if (utf8)
2000        {
2001        unsigned int othercase;
2002        if (next < 128) othercase = cd->fcc[next]; else
2003    #ifdef SUPPORT_UCP
2004        othercase = _pcre_ucp_othercase((unsigned int)next);
2005    #else
2006        othercase = NOTACHAR;
2007    #endif
2008        return (unsigned int)item != othercase;
2009        }
2010      else
2011    #endif  /* SUPPORT_UTF8 */
2012      return (item != cd->fcc[next]);  /* Non-UTF-8 mode */
2013    
2014      /* For OP_NOT, "item" must be a single-byte character. */
2015    
2016      case OP_NOT:
2017      if (next < 0) return FALSE;  /* Not a character */
2018      if (item == next) return TRUE;
2019      if ((options & PCRE_CASELESS) == 0) return FALSE;
2020    #ifdef SUPPORT_UTF8
2021      if (utf8)
2022        {
2023        unsigned int othercase;
2024        if (next < 128) othercase = cd->fcc[next]; else
2025    #ifdef SUPPORT_UCP
2026        othercase = _pcre_ucp_othercase(next);
2027    #else
2028        othercase = NOTACHAR;
2029    #endif
2030        return (unsigned int)item == othercase;
2031        }
2032      else
2033    #endif  /* SUPPORT_UTF8 */
2034      return (item == cd->fcc[next]);  /* Non-UTF-8 mode */
2035    
2036      case OP_DIGIT:
2037      return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2038    
2039      case OP_NOT_DIGIT:
2040      return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2041    
2042      case OP_WHITESPACE:
2043      return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2044    
2045      case OP_NOT_WHITESPACE:
2046      return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2047    
2048      case OP_WORDCHAR:
2049      return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2050    
2051      case OP_NOT_WORDCHAR:
2052      return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2053    
2054      case OP_HSPACE:
2055      case OP_NOT_HSPACE:
2056      switch(next)
2057        {
2058        case 0x09:
2059        case 0x20:
2060        case 0xa0:
2061        case 0x1680:
2062        case 0x180e:
2063        case 0x2000:
2064        case 0x2001:
2065        case 0x2002:
2066        case 0x2003:
2067        case 0x2004:
2068        case 0x2005:
2069        case 0x2006:
2070        case 0x2007:
2071        case 0x2008:
2072        case 0x2009:
2073        case 0x200A:
2074        case 0x202f:
2075        case 0x205f:
2076        case 0x3000:
2077        return op_code != OP_HSPACE;
2078        default:
2079        return op_code == OP_HSPACE;
2080        }
2081    
2082      case OP_VSPACE:
2083      case OP_NOT_VSPACE:
2084      switch(next)
2085        {
2086        case 0x0a:
2087        case 0x0b:
2088        case 0x0c:
2089        case 0x0d:
2090        case 0x85:
2091        case 0x2028:
2092        case 0x2029:
2093        return op_code != OP_VSPACE;
2094        default:
2095        return op_code == OP_VSPACE;
2096        }
2097    
2098      default:
2099      return FALSE;
2100      }
2101    
2102    
2103    /* Handle the case when the next item is \d, \s, etc. */
2104    
2105    switch(op_code)
2106      {
2107      case OP_CHAR:
2108      case OP_CHARNC:
2109    #ifdef SUPPORT_UTF8
2110      if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2111    #endif
2112      switch(-next)
2113        {
2114        case ESC_d:
2115        return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2116    
2117        case ESC_D:
2118        return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2119    
2120        case ESC_s:
2121        return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2122    
2123        case ESC_S:
2124        return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2125    
2126        case ESC_w:
2127        return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2128    
2129        case ESC_W:
2130        return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2131    
2132        case ESC_h:
2133        case ESC_H:
2134        switch(item)
2135          {
2136          case 0x09:
2137          case 0x20:
2138          case 0xa0:
2139          case 0x1680:
2140          case 0x180e:
2141          case 0x2000:
2142          case 0x2001:
2143          case 0x2002:
2144          case 0x2003:
2145          case 0x2004:
2146          case 0x2005:
2147          case 0x2006:
2148          case 0x2007:
2149          case 0x2008:
2150          case 0x2009:
2151          case 0x200A:
2152          case 0x202f:
2153          case 0x205f:
2154          case 0x3000:
2155          return -next != ESC_h;
2156          default:
2157          return -next == ESC_h;
2158          }
2159    
2160        case ESC_v:
2161        case ESC_V:
2162        switch(item)
2163          {
2164          case 0x0a:
2165          case 0x0b:
2166          case 0x0c:
2167          case 0x0d:
2168          case 0x85:
2169          case 0x2028:
2170          case 0x2029:
2171          return -next != ESC_v;
2172          default:
2173          return -next == ESC_v;
2174          }
2175    
2176        default:
2177        return FALSE;
2178        }
2179    
2180      case OP_DIGIT:
2181      return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2182             next == -ESC_h || next == -ESC_v;
2183    
2184      case OP_NOT_DIGIT:
2185      return next == -ESC_d;
2186    
2187      case OP_WHITESPACE:
2188      return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2189    
2190      case OP_NOT_WHITESPACE:
2191      return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2192    
2193      case OP_HSPACE:
2194      return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2195    
2196      case OP_NOT_HSPACE:
2197      return next == -ESC_h;
2198    
2199      /* Can't have \S in here because VT matches \S (Perl anomaly) */
2200      case OP_VSPACE:
2201      return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2202    
2203      case OP_NOT_VSPACE:
2204      return next == -ESC_v;
2205    
2206      case OP_WORDCHAR:
2207      return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2208    
2209      case OP_NOT_WORDCHAR:
2210      return next == -ESC_w || next == -ESC_d;
2211    
2212      default:
2213      return FALSE;
2214      }
2215    
2216    /* Control does not reach here */
2217    }
2218    
2219    
2220    
2221    /*************************************************
2222    *           Compile one branch                   *
2223    *************************************************/
2224    
2225    /* Scan the pattern, compiling it into a vector. If the options are
2226    changed during the branch, the pointer is used to change the external options
2227    bits. This function is used during the pre-compile phase when we are trying
2228    to find out the amount of memory needed, as well as during the real compile
2229    phase. The value of lengthptr distinguishes the two phases.
2230    
2231    Arguments:
2232      optionsptr     pointer to the option bits
2233      codeptr        points to the pointer to the current code point
2234      ptrptr         points to the current pattern pointer
2235      errorcodeptr   points to error code variable
2236      firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2237      reqbyteptr     set to the last literal character required, else < 0
2238      bcptr          points to current branch chain
2239      cd             contains pointers to tables etc.
2240      lengthptr      NULL during the real compile phase
2241                     points to length accumulator during pre-compile phase
2242    
2243    Returns:         TRUE on success
2244                     FALSE, with *errorcodeptr set non-zero on error
2245    */
2246    
2247    static BOOL
2248    compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2249      int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2250      compile_data *cd, int *lengthptr)
2251    {
2252    int repeat_type, op_type;
2253    int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
2254    int bravalue = 0;
2255    int greedy_default, greedy_non_default;
2256    int firstbyte, reqbyte;
2257    int zeroreqbyte, zerofirstbyte;
2258    int req_caseopt, reqvary, tempreqvary;
2259    int options = *optionsptr;
2260    int after_manual_callout = 0;
2261    int length_prevgroup = 0;
2262    register int c;
2263    register uschar *code = *codeptr;
2264    uschar *last_code = code;
2265    uschar *orig_code = code;
2266    uschar *tempcode;
2267    BOOL inescq = FALSE;
2268    BOOL groupsetfirstbyte = FALSE;
2269    const uschar *ptr = *ptrptr;
2270    const uschar *tempptr;
2271    uschar *previous = NULL;
2272    uschar *previous_callout = NULL;
2273    uschar *save_hwm = NULL;
2274    uschar classbits[32];
2275    
2276    #ifdef SUPPORT_UTF8
2277    BOOL class_utf8;
2278    BOOL utf8 = (options & PCRE_UTF8) != 0;
2279    uschar *class_utf8data;
2280    uschar utf8_char[6];
2281    #else
2282    BOOL utf8 = FALSE;
2283    uschar *utf8_char = NULL;
2284    #endif
2285    
2286    #ifdef DEBUG
2287    if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2288    #endif
2289    
2290    /* Set up the default and non-default settings for greediness */
2291    
2292    greedy_default = ((options & PCRE_UNGREEDY) != 0);
2293    greedy_non_default = greedy_default ^ 1;
2294    
2295    /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2296    matching encountered yet". It gets changed to REQ_NONE if we hit something that
2297    matches a non-fixed char first char; reqbyte just remains unset if we never
2298    find one.
2299    
2300  When we hit a repeat whose minimum is zero, we may have to adjust these values  When we hit a repeat whose minimum is zero, we may have to adjust these values
2301  to take the zero repeat into account. This is implemented by setting them to  to take the zero repeat into account. This is implemented by setting them to
# Line 1595  for (;; ptr++) Line 2318  for (;; ptr++)
2318    BOOL negate_class;    BOOL negate_class;
2319    BOOL possessive_quantifier;    BOOL possessive_quantifier;
2320    BOOL is_quantifier;    BOOL is_quantifier;
2321      BOOL is_recurse;
2322      BOOL reset_bracount;
2323    int class_charcount;    int class_charcount;
2324    int class_lastchar;    int class_lastchar;
2325    int newoptions;    int newoptions;
2326    int recno;    int recno;
2327      int refsign;
2328    int skipbytes;    int skipbytes;
2329    int subreqbyte;    int subreqbyte;
2330    int subfirstbyte;    int subfirstbyte;
2331      int terminator;
2332    int mclength;    int mclength;
2333    uschar mcbuffer[8];    uschar mcbuffer[8];
2334    
2335    /* Next byte in the pattern */    /* Get next byte in the pattern */
2336    
2337    c = *ptr;    c = *ptr;
2338    
2339      /* If we are in the pre-compile phase, accumulate the length used for the
2340      previous cycle of this loop. */
2341    
2342      if (lengthptr != NULL)
2343        {
2344    #ifdef DEBUG
2345        if (code > cd->hwm) cd->hwm = code;                 /* High water info */
2346    #endif
2347        if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2348          {
2349          *errorcodeptr = ERR52;
2350          goto FAILED;
2351          }
2352    
2353        /* There is at least one situation where code goes backwards: this is the
2354        case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2355        the class is simply eliminated. However, it is created first, so we have to
2356        allow memory for it. Therefore, don't ever reduce the length at this point.
2357        */
2358    
2359        if (code < last_code) code = last_code;
2360    
2361        /* Paranoid check for integer overflow */
2362    
2363        if (OFLOW_MAX - *lengthptr < code - last_code)
2364          {
2365          *errorcodeptr = ERR20;
2366          goto FAILED;
2367          }
2368    
2369        *lengthptr += code - last_code;
2370        DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2371    
2372        /* If "previous" is set and it is not at the start of the work space, move
2373        it back to there, in order to avoid filling up the work space. Otherwise,
2374        if "previous" is NULL, reset the current code pointer to the start. */
2375    
2376        if (previous != NULL)
2377          {
2378          if (previous > orig_code)
2379            {
2380            memmove(orig_code, previous, code - previous);
2381            code -= previous - orig_code;
2382            previous = orig_code;
2383            }
2384          }
2385        else code = orig_code;
2386    
2387        /* Remember where this code item starts so we can pick up the length
2388        next time round. */
2389    
2390        last_code = code;
2391        }
2392    
2393      /* In the real compile phase, just check the workspace used by the forward
2394      reference list. */
2395    
2396      else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2397        {
2398        *errorcodeptr = ERR52;
2399        goto FAILED;
2400        }
2401    
2402    /* If in \Q...\E, check for the end; if not, we have a literal */    /* If in \Q...\E, check for the end; if not, we have a literal */
2403    
2404    if (inescq && c != 0)    if (inescq && c != 0)
# Line 1623  for (;; ptr++) Line 2413  for (;; ptr++)
2413        {        {
2414        if (previous_callout != NULL)        if (previous_callout != NULL)
2415          {          {
2416          complete_callout(previous_callout, ptr, cd);          if (lengthptr == NULL)  /* Don't attempt in pre-compile phase */
2417              complete_callout(previous_callout, ptr, cd);
2418          previous_callout = NULL;          previous_callout = NULL;
2419          }          }
2420        if ((options & PCRE_AUTO_CALLOUT) != 0)        if ((options & PCRE_AUTO_CALLOUT) != 0)
# Line 1644  for (;; ptr++) Line 2435  for (;; ptr++)
2435    if (!is_quantifier && previous_callout != NULL &&    if (!is_quantifier && previous_callout != NULL &&
2436         after_manual_callout-- <= 0)         after_manual_callout-- <= 0)
2437      {      {
2438      complete_callout(previous_callout, ptr, cd);      if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
2439          complete_callout(previous_callout, ptr, cd);
2440      previous_callout = NULL;      previous_callout = NULL;
2441      }      }
2442    
# Line 1655  for (;; ptr++) Line 2447  for (;; ptr++)
2447      if ((cd->ctypes[c] & ctype_space) != 0) continue;      if ((cd->ctypes[c] & ctype_space) != 0) continue;
2448      if (c == '#')      if (c == '#')
2449        {        {
2450        /* The space before the ; is to avoid a warning on a silly compiler        while (*(++ptr) != 0)
2451        on the Macintosh. */          {
2452        while ((c = *(++ptr)) != 0 && c != NEWLINE) ;          if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2453        if (c != 0) continue;   /* Else fall through to handle end of string */          }
2454          if (*ptr != 0) continue;
2455    
2456          /* Else fall through to handle end of string */
2457          c = 0;
2458        }        }
2459      }      }
2460    
# Line 1672  for (;; ptr++) Line 2468  for (;; ptr++)
2468    
2469    switch(c)    switch(c)
2470      {      {
2471      /* The branch terminates at end of string, |, or ). */      /* ===================================================================*/
2472        case 0:                        /* The branch terminates at string end */
2473      case 0:      case '|':                      /* or | or ) */
     case '|':  
2474      case ')':      case ')':
2475      *firstbyteptr = firstbyte;      *firstbyteptr = firstbyte;
2476      *reqbyteptr = reqbyte;      *reqbyteptr = reqbyte;
2477      *codeptr = code;      *codeptr = code;
2478      *ptrptr = ptr;      *ptrptr = ptr;
2479        if (lengthptr != NULL)
2480          {
2481          if (OFLOW_MAX - *lengthptr < code - last_code)
2482            {
2483            *errorcodeptr = ERR20;
2484            goto FAILED;
2485            }
2486          *lengthptr += code - last_code;   /* To include callout length */
2487          DPRINTF((">> end branch\n"));
2488          }
2489      return TRUE;      return TRUE;
2490    
2491    
2492        /* ===================================================================*/
2493      /* Handle single-character metacharacters. In multiline mode, ^ disables      /* Handle single-character metacharacters. In multiline mode, ^ disables
2494      the setting of any following char as a first character. */      the setting of any following char as a first character. */
2495    
# Line 1711  for (;; ptr++) Line 2518  for (;; ptr++)
2518      *code++ = OP_ANY;      *code++ = OP_ANY;
2519      break;      break;
2520    
2521      /* Character classes. If the included characters are all < 255 in value, we  
2522      build a 32-byte bitmap of the permitted characters, except in the special      /* ===================================================================*/
2523      case where there is only one such character. For negated classes, we build      /* Character classes. If the included characters are all < 256, we build a
2524      the map as usual, then invert it at the end. However, we use a different      32-byte bitmap of the permitted characters, except in the special case
2525      opcode so that data characters > 255 can be handled correctly.      where there is only one such character. For negated classes, we build the
2526        map as usual, then invert it at the end. However, we use a different opcode
2527        so that data characters > 255 can be handled correctly.
2528    
2529      If the class contains characters outside the 0-255 range, a different      If the class contains characters outside the 0-255 range, a different
2530      opcode is compiled. It may optionally have a bit map for characters < 256,      opcode is compiled. It may optionally have a bit map for characters < 256,
# Line 1736  for (;; ptr++) Line 2545  for (;; ptr++)
2545        goto FAILED;        goto FAILED;
2546        }        }
2547    
2548      /* If the first character is '^', set the negation flag and skip it. */      /* If the first character is '^', set the negation flag and skip it. Also,
2549        if the first few characters (either before or after ^) are \Q\E or \E we
2550        skip them too. This makes for compatibility with Perl. */
2551    
2552      if ((c = *(++ptr)) == '^')      negate_class = FALSE;
2553        for (;;)
2554        {        {
       negate_class = TRUE;  
2555        c = *(++ptr);        c = *(++ptr);
2556        }        if (c == '\\')
2557      else          {
2558        {          if (ptr[1] == 'E') ptr++;
2559        negate_class = FALSE;            else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
2560                else break;
2561            }
2562          else if (!negate_class && c == '^')
2563            negate_class = TRUE;
2564          else break;
2565        }        }
2566    
2567      /* Keep a count of chars with values < 256 so that we can optimize the case      /* Keep a count of chars with values < 256 so that we can optimize the case
2568      of just a single character (as long as it's < 256). For higher valued UTF-8      of just a single character (as long as it's < 256). However, for higher
2569      characters, we don't yet do any optimization. */      valued UTF-8 characters, we don't yet do any optimization. */
2570    
2571      class_charcount = 0;      class_charcount = 0;
2572      class_lastchar = -1;      class_lastchar = -1;
2573    
2574        /* Initialize the 32-char bit map to all zeros. We build the map in a
2575        temporary bit of memory, in case the class contains only 1 character (less
2576        than 256), because in that case the compiled code doesn't use the bit map.
2577        */
2578    
2579        memset(classbits, 0, 32 * sizeof(uschar));
2580    
2581  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2582      class_utf8 = FALSE;                       /* No chars >= 256 */      class_utf8 = FALSE;                       /* No chars >= 256 */
2583      class_utf8data = code + LINK_SIZE + 34;   /* For UTF-8 items */      class_utf8data = code + LINK_SIZE + 2;    /* For UTF-8 items */
2584  #endif  #endif
2585    
     /* Initialize the 32-char bit map to all zeros. We have to build the  
     map in a temporary bit of store, in case the class contains only 1  
     character (< 256), because in that case the compiled code doesn't use the  
     bit map. */  
   
     memset(classbits, 0, 32 * sizeof(uschar));  
   
2586      /* Process characters until ] is reached. By writing this as a "do" it      /* Process characters until ] is reached. By writing this as a "do" it
2587      means that an initial ] is taken as a data character. The first pass      means that an initial ] is taken as a data character. At the start of the
2588      through the regex checked the overall syntax, so we don't need to be very      loop, c contains the first byte of the character. */
     strict here. At the start of the loop, c contains the first byte of the  
     character. */  
2589    
2590      do      if (c != 0) do
2591        {        {
2592          const uschar *oldptr;
2593    
2594  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2595        if (utf8 && c > 127)        if (utf8 && c > 127)
2596          {                           /* Braces are required because the */          {                           /* Braces are required because the */
# Line 1786  for (;; ptr++) Line 2602  for (;; ptr++)
2602    
2603        if (inescq)        if (inescq)
2604          {          {
2605          if (c == '\\' && ptr[1] == 'E')          if (c == '\\' && ptr[1] == 'E')     /* If we are at \E */
2606            {            {
2607            inescq = FALSE;            inescq = FALSE;                   /* Reset literal state */
2608            ptr++;            ptr++;                            /* Skip the 'E' */
2609            continue;            continue;                         /* Carry on with next */
2610            }            }
2611          else goto LONE_SINGLE_CHARACTER;          goto CHECK_RANGE;                   /* Could be range if \E follows */
2612          }          }
2613    
2614        /* Handle POSIX class names. Perl allows a negation extension of the        /* Handle POSIX class names. Perl allows a negation extension of the
# Line 1806  for (;; ptr++) Line 2622  for (;; ptr++)
2622            check_posix_syntax(ptr, &tempptr, cd))            check_posix_syntax(ptr, &tempptr, cd))
2623          {          {
2624          BOOL local_negate = FALSE;          BOOL local_negate = FALSE;
2625          int posix_class, i;          int posix_class, taboffset, tabopt;
2626          register const uschar *cbits = cd->cbits;          register const uschar *cbits = cd->cbits;
2627            uschar pbits[32];
2628    
2629          if (ptr[1] != ':')          if (ptr[1] != ':')
2630            {            {
# Line 1836  for (;; ptr++) Line 2653  for (;; ptr++)
2653          if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)          if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2654            posix_class = 0;            posix_class = 0;
2655    
2656          /* Or into the map we are building up to 3 of the static class          /* We build the bit map for the POSIX class in a chunk of local store
2657          tables, or their negations. The [:blank:] class sets up the same          because we may be adding and subtracting from it, and we don't want to
2658          chars as the [:space:] class (all white space). We remove the vertical          subtract bits that may be in the main map already. At the end we or the
2659          white space chars afterwards. */          result into the bit map that is being built. */
2660    
2661          posix_class *= 3;          posix_class *= 3;
2662          for (i = 0; i < 3; i++)  
2663            /* Copy in the first table (always present) */
2664    
2665            memcpy(pbits, cbits + posix_class_maps[posix_class],
2666              32 * sizeof(uschar));
2667    
2668            /* If there is a second table, add or remove it as required. */
2669    
2670            taboffset = posix_class_maps[posix_class + 1];
2671            tabopt = posix_class_maps[posix_class + 2];
2672    
2673            if (taboffset >= 0)
2674            {            {
2675            BOOL blankclass = strncmp((char *)ptr, "blank", 5) == 0;            if (tabopt >= 0)
2676            int taboffset = posix_class_maps[posix_class + i];              for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
           if (taboffset < 0) break;  
           if (local_negate)  
             {  
             if (i == 0)  
               for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+taboffset];  
             else  
               for (c = 0; c < 32; c++) classbits[c] &= ~cbits[c+taboffset];  
             if (blankclass) classbits[1] |= 0x3c;  
             }  
2677            else            else
2678              {              for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+taboffset];  
             if (blankclass) classbits[1] &= ~0x3c;  
             }  
2679            }            }
2680    
2681            /* Now see if we need to remove any special characters. An option
2682            value of 1 removes vertical space and 2 removes underscore. */
2683    
2684            if (tabopt < 0) tabopt = -tabopt;
2685            if (tabopt == 1) pbits[1] &= ~0x3c;
2686              else if (tabopt == 2) pbits[11] &= 0x7f;
2687    
2688            /* Add the POSIX table or its complement into the main table that is
2689            being built and we are done. */
2690    
2691            if (local_negate)
2692              for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2693            else
2694              for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2695    
2696          ptr = tempptr + 1;          ptr = tempptr + 1;
2697          class_charcount = 10;  /* Set > 1; assumes more than 1 per class */          class_charcount = 10;  /* Set > 1; assumes more than 1 per class */
2698          continue;    /* End of POSIX syntax handling */          continue;    /* End of POSIX syntax handling */
2699          }          }
2700    
2701        /* Backslash may introduce a single character, or it may introduce one        /* Backslash may introduce a single character, or it may introduce one
2702        of the specials, which just set a flag. Escaped items are checked for        of the specials, which just set a flag. The sequence \b is a special
2703        validity in the pre-compiling pass. The sequence \b is a special case.        case. Inside a class (and only there) it is treated as backspace.
2704        Inside a class (and only there) it is treated as backspace. Elsewhere        Elsewhere it marks a word boundary. Other escapes have preset maps ready
2705        it marks a word boundary. Other escapes have preset maps ready to        to 'or' into the one we are building. We assume they have more than one
       or into the one we are building. We assume they have more than one  
2706        character in them, so set class_charcount bigger than one. */        character in them, so set class_charcount bigger than one. */
2707    
2708        if (c == '\\')        if (c == '\\')
2709          {          {
2710          c = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);          c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2711            if (*errorcodeptr != 0) goto FAILED;
2712    
2713          if (-c == ESC_b) c = '\b';       /* \b is backspace in a class */          if (-c == ESC_b) c = '\b';       /* \b is backspace in a class */
2714          else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */          else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */
2715            else if (-c == ESC_R) c = 'R';   /* \R is literal R in a class */
2716          else if (-c == ESC_Q)            /* Handle start of quoted string */          else if (-c == ESC_Q)            /* Handle start of quoted string */
2717            {            {
2718            if (ptr[1] == '\\' && ptr[2] == 'E')            if (ptr[1] == '\\' && ptr[2] == 'E')
# Line 1890  for (;; ptr++) Line 2722  for (;; ptr++)
2722            else inescq = TRUE;            else inescq = TRUE;
2723            continue;            continue;
2724            }            }
2725            else if (-c == ESC_E) continue;  /* Ignore orphan \E */
2726    
2727          if (c < 0)          if (c < 0)
2728            {            {
2729            register const uschar *cbits = cd->cbits;            register const uschar *cbits = cd->cbits;
2730            class_charcount += 2;     /* Greater than 1 is what matters */            class_charcount += 2;     /* Greater than 1 is what matters */
2731            switch (-c)  
2732              /* Save time by not doing this in the pre-compile phase. */
2733    
2734              if (lengthptr == NULL) switch (-c)
2735              {              {
2736              case ESC_d:              case ESC_d:
2737              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
# Line 1923  for (;; ptr++) Line 2759  for (;; ptr++)
2759              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
2760              continue;              continue;
2761    
2762  #ifdef SUPPORT_UCP              case ESC_E: /* Perl ignores an orphan \E */
2763              case ESC_p:              continue;
2764              case ESC_P:  
2765                default:    /* Not recognized; fall through */
2766                break;      /* Need "default" setting to stop compiler warning. */
2767                }
2768    
2769              /* In the pre-compile phase, just do the recognition. */
2770    
2771              else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2772                       c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2773    
2774              /* We need to deal with \H, \h, \V, and \v in both phases because
2775              they use extra memory. */
2776    
2777              if (-c == ESC_h)
2778                {
2779                SETBIT(classbits, 0x09); /* HT */
2780                SETBIT(classbits, 0x20); /* SPACE */
2781                SETBIT(classbits, 0xa0); /* NBSP */
2782    #ifdef SUPPORT_UTF8
2783                if (utf8)
2784                {                {
               BOOL negated;  
               int property = get_ucp(&ptr, &negated, errorcodeptr);  
               if (property < 0) goto FAILED;  
2785                class_utf8 = TRUE;                class_utf8 = TRUE;
2786                *class_utf8data++ = ((-c == ESC_p) != negated)?                *class_utf8data++ = XCL_SINGLE;
2787                  XCL_PROP : XCL_NOTPROP;                class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2788                *class_utf8data++ = property;                *class_utf8data++ = XCL_SINGLE;
2789                class_charcount -= 2;   /* Not a < 256 character */                class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2790                  *class_utf8data++ = XCL_RANGE;
2791                  class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2792                  class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2793                  *class_utf8data++ = XCL_SINGLE;
2794                  class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2795                  *class_utf8data++ = XCL_SINGLE;
2796                  class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2797                  *class_utf8data++ = XCL_SINGLE;
2798                  class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
2799                }                }
             continue;  
2800  #endif  #endif
2801                continue;
2802                }
2803    
2804              /* Unrecognized escapes are faulted if PCRE is running in its            if (-c == ESC_H)
2805              strict mode. By default, for compatibility with Perl, they are              {
2806              treated as literals. */              for (c = 0; c < 32; c++)
2807                  {
2808                  int x = 0xff;
2809                  switch (c)
2810                    {
2811                    case 0x09/8: x ^= 1 << (0x09%8); break;
2812                    case 0x20/8: x ^= 1 << (0x20%8); break;
2813                    case 0xa0/8: x ^= 1 << (0xa0%8); break;
2814                    default: break;
2815                    }
2816                  classbits[c] |= x;
2817                  }
2818    
2819              default:  #ifdef SUPPORT_UTF8
2820              if ((options & PCRE_EXTRA) != 0)              if (utf8)
2821                {                {
2822                *errorcodeptr = ERR7;                class_utf8 = TRUE;
2823                goto FAILED;                *class_utf8data++ = XCL_RANGE;
2824                  class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2825                  class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
2826                  *class_utf8data++ = XCL_RANGE;
2827                  class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
2828                  class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
2829                  *class_utf8data++ = XCL_RANGE;
2830                  class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
2831                  class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
2832                  *class_utf8data++ = XCL_RANGE;
2833                  class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
2834                  class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
2835                  *class_utf8data++ = XCL_RANGE;
2836                  class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
2837                  class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
2838                  *class_utf8data++ = XCL_RANGE;
2839                  class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
2840                  class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
2841                  *class_utf8data++ = XCL_RANGE;
2842                  class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
2843                  class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2844                }                }
2845              c = *ptr;              /* The final character */  #endif
2846              class_charcount -= 2;  /* Undo the default count from above */              continue;
2847              }              }
2848            }  
2849              if (-c == ESC_v)
2850          /* Fall through if we have a single character (c >= 0). This may be              {
2851          > 256 in UTF-8 mode. */              SETBIT(classbits, 0x0a); /* LF */
2852                SETBIT(classbits, 0x0b); /* VT */
2853          }   /* End of backslash handling */              SETBIT(classbits, 0x0c); /* FF */
2854                SETBIT(classbits, 0x0d); /* CR */
2855                SETBIT(classbits, 0x85); /* NEL */
2856    #ifdef SUPPORT_UTF8
2857                if (utf8)
2858                  {
2859                  class_utf8 = TRUE;
2860                  *class_utf8data++ = XCL_RANGE;
2861                  class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
2862                  class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2863                  }
2864    #endif
2865                continue;
2866                }
2867    
2868              if (-c == ESC_V)
2869                {
2870                for (c = 0; c < 32; c++)
2871                  {
2872                  int x = 0xff;
2873                  switch (c)
2874                    {
2875                    case 0x0a/8: x ^= 1 << (0x0a%8);
2876                                 x ^= 1 << (0x0b%8);
2877                                 x ^= 1 << (0x0c%8);
2878                                 x ^= 1 << (0x0d%8);
2879                                 break;
2880                    case 0x85/8: x ^= 1 << (0x85%8); break;
2881                    default: break;
2882                    }
2883                  classbits[c] |= x;
2884                  }
2885    
2886    #ifdef SUPPORT_UTF8
2887                if (utf8)
2888                  {
2889                  class_utf8 = TRUE;
2890                  *class_utf8data++ = XCL_RANGE;
2891                  class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2892                  class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
2893                  *class_utf8data++ = XCL_RANGE;
2894                  class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2895                  class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2896                  }
2897    #endif
2898                continue;
2899                }
2900    
2901              /* We need to deal with \P and \p in both phases. */
2902    
2903    #ifdef SUPPORT_UCP
2904              if (-c == ESC_p || -c == ESC_P)
2905                {
2906                BOOL negated;
2907                int pdata;
2908                int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2909                if (ptype < 0) goto FAILED;
2910                class_utf8 = TRUE;
2911                *class_utf8data++ = ((-c == ESC_p) != negated)?
2912                  XCL_PROP : XCL_NOTPROP;
2913                *class_utf8data++ = ptype;
2914                *class_utf8data++ = pdata;
2915                class_charcount -= 2;   /* Not a < 256 character */
2916                continue;
2917                }
2918    #endif
2919              /* Unrecognized escapes are faulted if PCRE is running in its
2920              strict mode. By default, for compatibility with Perl, they are
2921              treated as literals. */
2922    
2923              if ((options & PCRE_EXTRA) != 0)
2924                {
2925                *errorcodeptr = ERR7;
2926                goto FAILED;
2927                }
2928    
2929              class_charcount -= 2;  /* Undo the default count from above */
2930              c = *ptr;              /* Get the final character and fall through */
2931              }
2932    
2933            /* Fall through if we have a single character (c >= 0). This may be
2934            greater than 256 in UTF-8 mode. */
2935    
2936            }   /* End of backslash handling */
2937    
2938        /* A single character may be followed by '-' to form a range. However,        /* A single character may be followed by '-' to form a range. However,
2939        Perl does not permit ']' to be the end of the range. A '-' character        Perl does not permit ']' to be the end of the range. A '-' character
2940        here is treated as a literal. */        at the end is treated as a literal. Perl ignores orphaned \E sequences
2941          entirely. The code for handling \Q and \E is messy. */
2942    
2943          CHECK_RANGE:
2944          while (ptr[1] == '\\' && ptr[2] == 'E')
2945            {
2946            inescq = FALSE;
2947            ptr += 2;
2948            }
2949    
2950        if (ptr[1] == '-' && ptr[2] != ']')        oldptr = ptr;
2951    
2952          if (!inescq && ptr[1] == '-')
2953          {          {
2954          int d;          int d;
2955          ptr += 2;          ptr += 2;
2956            while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
2957    
2958            /* If we hit \Q (not followed by \E) at this point, go into escaped
2959            mode. */
2960    
2961            while (*ptr == '\\' && ptr[1] == 'Q')
2962              {
2963              ptr += 2;
2964              if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
2965              inescq = TRUE;
2966              break;
2967              }
2968    
2969            if (*ptr == 0 || (!inescq && *ptr == ']'))
2970              {
2971              ptr = oldptr;
2972              goto LONE_SINGLE_CHARACTER;
2973              }
2974    
2975  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2976          if (utf8)          if (utf8)
# Line 1981  for (;; ptr++) Line 2985  for (;; ptr++)
2985          not any of the other escapes. Perl 5.6 treats a hyphen as a literal          not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2986          in such circumstances. */          in such circumstances. */
2987    
2988          if (d == '\\')          if (!inescq && d == '\\')
2989            {            {
2990            const uschar *oldptr = ptr;            d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2991            d = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);            if (*errorcodeptr != 0) goto FAILED;
2992    
2993            /* \b is backspace; \X is literal X; any other special means the '-'            /* \b is backspace; \X is literal X; \R is literal R; any other
2994            was literal */            special means the '-' was literal */
2995    
2996            if (d < 0)            if (d < 0)
2997              {              {
2998              if (d == -ESC_b) d = '\b';              if (d == -ESC_b) d = '\b';
2999              else if (d == -ESC_X) d = 'X'; else              else if (d == -ESC_X) d = 'X';
3000                else if (d == -ESC_R) d = 'R'; else
3001                {                {
3002                ptr = oldptr - 2;                ptr = oldptr;
3003                goto LONE_SINGLE_CHARACTER;  /* A few lines below */                goto LONE_SINGLE_CHARACTER;  /* A few lines below */
3004                }                }
3005              }              }
3006            }            }
3007    
3008          /* The check that the two values are in the correct order happens in          /* Check that the two values are in the correct order. Optimize
3009          the pre-pass. Optimize one-character ranges */          one-character ranges */
3010    
3011            if (d < c)
3012              {
3013              *errorcodeptr = ERR8;
3014              goto FAILED;
3015              }
3016    
3017          if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */          if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */
3018    
# Line 2022  for (;; ptr++) Line 3033  for (;; ptr++)
3033  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
3034            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
3035              {              {
3036              int occ, ocd;              unsigned int occ, ocd;
3037              int cc = c;              unsigned int cc = c;
3038              int origd = d;              unsigned int origd = d;
3039              while (get_othercase_range(&cc, origd, &occ, &ocd))              while (get_othercase_range(&cc, origd, &occ, &ocd))
3040                {                {
3041                if (occ >= c && ocd <= d) continue;  /* Skip embedded ranges */                if (occ >= (unsigned int)c &&
3042                      ocd <= (unsigned int)d)
3043                    continue;                          /* Skip embedded ranges */
3044    
3045                if (occ < c  && ocd >= c - 1)        /* Extend the basic range */                if (occ < (unsigned int)c  &&
3046                      ocd >= (unsigned int)c - 1)      /* Extend the basic range */
3047                  {                                  /* if there is overlap,   */                  {                                  /* if there is overlap,   */
3048                  c = occ;                           /* noting that if occ < c */                  c = occ;                           /* noting that if occ < c */
3049                  continue;                          /* we can't have ocd > d  */                  continue;                          /* we can't have ocd > d  */
3050                  }                                  /* because a subrange is  */                  }                                  /* because a subrange is  */
3051                if (ocd > d && occ <= d + 1)         /* always shorter than    */                if (ocd > (unsigned int)d &&
3052                      occ <= (unsigned int)d + 1)      /* always shorter than    */
3053                  {                                  /* the basic range.       */                  {                                  /* the basic range.       */
3054                  d = ocd;                  d = ocd;
3055                  continue;                  continue;
# Line 2082  for (;; ptr++) Line 3097  for (;; ptr++)
3097          ranges that lie entirely within 0-127 when there is UCP support; else          ranges that lie entirely within 0-127 when there is UCP support; else
3098          for partial ranges without UCP support. */          for partial ranges without UCP support. */
3099    
3100          for (; c <= d; c++)          class_charcount += d - c + 1;
3101            class_lastchar = d;
3102    
3103            /* We can save a bit of time by skipping this in the pre-compile. */
3104    
3105            if (lengthptr == NULL) for (; c <= d; c++)
3106            {            {
3107            classbits[c/8] |= (1 << (c&7));            classbits[c/8] |= (1 << (c&7));
3108            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
# Line 2090  for (;; ptr++) Line 3110  for (;; ptr++)
3110              int uc = cd->fcc[c];           /* flip case */              int uc = cd->fcc[c];           /* flip case */
3111              classbits[uc/8] |= (1 << (uc&7));              classbits[uc/8] |= (1 << (uc&7));
3112              }              }
           class_charcount++;                /* in case a one-char range */  
           class_lastchar = c;  
3113            }            }
3114    
3115          continue;   /* Go get the next char in the class */          continue;   /* Go get the next char in the class */
# Line 2115  for (;; ptr++) Line 3133  for (;; ptr++)
3133  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
3134          if ((options & PCRE_CASELESS) != 0)          if ((options & PCRE_CASELESS) != 0)
3135            {            {
3136            int chartype;            unsigned int othercase;
3137            int othercase;            if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
           if (_pcre_ucp_findchar(c, &chartype, &othercase) >= 0 &&  
                othercase > 0)  
3138              {              {
3139              *class_utf8data++ = XCL_SINGLE;              *class_utf8data++ = XCL_SINGLE;
3140              class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);              class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
# Line 2143  for (;; ptr++) Line 3159  for (;; ptr++)
3159          }          }
3160        }        }
3161    
3162      /* Loop until ']' reached; the check for end of string happens inside the      /* Loop until ']' reached. This "while" is the end of the "do" above. */
3163      loop. This "while" is the end of the "do" above. */  
3164        while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
3165    
3166      while ((c = *(++ptr)) != ']' || inescq);      if (c == 0)                          /* Missing terminating ']' */
3167          {
3168          *errorcodeptr = ERR6;
3169          goto FAILED;
3170          }
3171    
3172      /* If class_charcount is 1, we saw precisely one character whose value is      /* If class_charcount is 1, we saw precisely one character whose value is
3173      less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we      less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
# Line 2210  for (;; ptr++) Line 3231  for (;; ptr++)
3231    
3232      /* If there are characters with values > 255, we have to compile an      /* If there are characters with values > 255, we have to compile an
3233      extended class, with its own opcode. If there are no characters < 256,      extended class, with its own opcode. If there are no characters < 256,
3234      we can omit the bitmap. */      we can omit the bitmap in the actual compiled code. */
3235    
3236  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
3237      if (class_utf8)      if (class_utf8)
# Line 2220  for (;; ptr++) Line 3241  for (;; ptr++)
3241        code += LINK_SIZE;        code += LINK_SIZE;
3242        *code = negate_class? XCL_NOT : 0;        *code = negate_class? XCL_NOT : 0;
3243    
3244        /* If the map is required, install it, and move on to the end of        /* If the map is required, move up the extra data to make room for it;
3245        the extra data */        otherwise just move the code pointer to the end of the extra data. */
3246    
3247        if (class_charcount > 0)        if (class_charcount > 0)
3248          {          {
3249          *code++ |= XCL_MAP;          *code++ |= XCL_MAP;
3250            memmove(code + 32, code, class_utf8data - code);
3251          memcpy(code, classbits, 32);          memcpy(code, classbits, 32);
3252          code = class_utf8data;          code = class_utf8data + 32;
         }  
   
       /* If the map is not required, slide down the extra data. */  
   
       else  
         {  
         int len = class_utf8data - (code + 33);  
         memmove(code + 1, code + 33, len);  
         code += len + 1;  
3253          }          }
3254          else code = class_utf8data;
3255    
3256        /* Now fill in the complete length of the item */        /* Now fill in the complete length of the item */
3257    
# Line 2254  for (;; ptr++) Line 3268  for (;; ptr++)
3268      if (negate_class)      if (negate_class)
3269        {        {
3270        *code++ = OP_NCLASS;        *code++ = OP_NCLASS;
3271        for (c = 0; c < 32; c++) code[c] = ~classbits[c];        if (lengthptr == NULL)    /* Save time in the pre-compile phase */
3272            for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3273        }        }
3274      else      else
3275        {        {
# Line 2264  for (;; ptr++) Line 3279  for (;; ptr++)
3279      code += 32;      code += 32;
3280      break;      break;
3281    
3282    
3283        /* ===================================================================*/
3284      /* Various kinds of repeat; '{' is not necessarily a quantifier, but this      /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3285      has been tested above. */      has been tested above. */
3286    
# Line 2331  for (;; ptr++) Line 3348  for (;; ptr++)
3348        }        }
3349      else repeat_type = greedy_default;      else repeat_type = greedy_default;
3350    
     /* If previous was a recursion, we need to wrap it inside brackets so that  
     it can be replicated if necessary. */  
   
     if (*previous == OP_RECURSE)  
       {  
       memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);  
       code += 1 + LINK_SIZE;  
       *previous = OP_BRA;  
       PUT(previous, 1, code - previous);  
       *code = OP_KET;  
       PUT(code, 1, code - previous);  
       code += 1 + LINK_SIZE;  
       }  
   
3351      /* If previous was a character match, abolish the item and generate a      /* If previous was a character match, abolish the item and generate a
3352      repeat item instead. If a char item has a minimum of more than one, ensure      repeat item instead. If a char item has a minimum of more than one, ensure
3353      that it is set in reqbyte - it might not be if a sequence such as x{3} is      that it is set in reqbyte - it might not be if a sequence such as x{3} is
# Line 2378  for (;; ptr++) Line 3381  for (;; ptr++)
3381          if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;          if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3382          }          }
3383    
3384          /* If the repetition is unlimited, it pays to see if the next thing on
3385          the line is something that cannot possibly match this character. If so,
3386          automatically possessifying this item gains some performance in the case
3387          where the match fails. */
3388    
3389          if (!possessive_quantifier &&
3390              repeat_max < 0 &&
3391              check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3392                options, cd))
3393            {
3394            repeat_type = 0;    /* Force greedy */
3395            possessive_quantifier = TRUE;
3396            }
3397    
3398        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
3399        }        }
3400    
3401      /* If previous was a single negated character ([^a] or similar), we use      /* If previous was a single negated character ([^a] or similar), we use
3402      one of the special opcodes, replacing it. The code is shared with single-      one of the special opcodes, replacing it. The code is shared with single-
3403      character repeats by setting opt_type to add a suitable offset into      character repeats by setting opt_type to add a suitable offset into
3404      repeat_type. OP_NOT is currently used only for single-byte chars. */      repeat_type. We can also test for auto-possessification. OP_NOT is
3405        currently used only for single-byte chars. */
3406    
3407      else if (*previous == OP_NOT)      else if (*previous == OP_NOT)
3408        {        {
3409        op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */        op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */
3410        c = previous[1];        c = previous[1];
3411          if (!possessive_quantifier &&
3412              repeat_max < 0 &&
3413              check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3414            {
3415            repeat_type = 0;    /* Force greedy */
3416            possessive_quantifier = TRUE;
3417            }
3418        goto OUTPUT_SINGLE_REPEAT;        goto OUTPUT_SINGLE_REPEAT;
3419        }        }
3420    
# Line 2403  for (;; ptr++) Line 3428  for (;; ptr++)
3428      else if (*previous < OP_EODN)      else if (*previous < OP_EODN)
3429        {        {
3430        uschar *oldcode;        uschar *oldcode;
3431        int prop_type;        int prop_type, prop_value;
3432        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
3433        c = *previous;        c = *previous;
3434    
3435          if (!possessive_quantifier &&
3436              repeat_max < 0 &&
3437              check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3438            {
3439            repeat_type = 0;    /* Force greedy */
3440            possessive_quantifier = TRUE;
3441            }
3442    
3443        OUTPUT_SINGLE_REPEAT:        OUTPUT_SINGLE_REPEAT:
3444        prop_type = (*previous == OP_PROP || *previous == OP_NOTPROP)?        if (*previous == OP_PROP || *previous == OP_NOTPROP)
3445          previous[1] : -1;          {
3446            prop_type = previous[1];
3447            prop_value = previous[2];
3448            }
3449          else prop_type = prop_value = -1;
3450    
3451        oldcode = code;        oldcode = code;
3452        code = previous;                  /* Usually overwrite previous item */        code = previous;                  /* Usually overwrite previous item */
# Line 2443  for (;; ptr++) Line 3480  for (;; ptr++)
3480          }          }
3481    
3482        /* A repeat minimum of 1 is optimized into some special cases. If the        /* A repeat minimum of 1 is optimized into some special cases. If the
3483        maximum is unlimited, we use OP_PLUS. Otherwise, the original item it        maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3484        left in place and, if the maximum is greater than 1, we use OP_UPTO with        left in place and, if the maximum is greater than 1, we use OP_UPTO with
3485        one less than the maximum. */        one less than the maximum. */
3486    
# Line 2470  for (;; ptr++) Line 3507  for (;; ptr++)
3507    
3508          /* If the maximum is unlimited, insert an OP_STAR. Before doing so,          /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3509          we have to insert the character for the previous code. For a repeated          we have to insert the character for the previous code. For a repeated
3510          Unicode property match, there is an extra byte that defines the          Unicode property match, there are two extra bytes that define the
3511          required property. In UTF-8 mode, long characters have their length in          required property. In UTF-8 mode, long characters have their length in
3512          c, with the 0x80 bit as a flag. */          c, with the 0x80 bit as a flag. */
3513    
# Line 2486  for (;; ptr++) Line 3523  for (;; ptr++)
3523  #endif  #endif
3524              {              {
3525              *code++ = c;              *code++ = c;
3526              if (prop_type >= 0) *code++ = prop_type;              if (prop_type >= 0)
3527                  {
3528                  *code++ = prop_type;
3529                  *code++ = prop_value;
3530                  }
3531              }              }
3532            *code++ = OP_STAR + repeat_type;            *code++ = OP_STAR + repeat_type;
3533            }            }
3534    
3535          /* Else insert an UPTO if the max is greater than the min, again          /* Else insert an UPTO if the max is greater than the min, again
3536          preceded by the character, for the previously inserted code. */          preceded by the character, for the previously inserted code. If the
3537            UPTO is just for 1 instance, we can use QUERY instead. */
3538    
3539          else if (repeat_max != repeat_min)          else if (repeat_max != repeat_min)
3540            {            {
# Line 2505  for (;; ptr++) Line 3547  for (;; ptr++)
3547            else            else
3548  #endif  #endif
3549            *code++ = c;            *code++ = c;
3550            if (prop_type >= 0) *code++ = prop_type;            if (prop_type >= 0)
3551                {
3552                *code++ = prop_type;
3553                *code++ = prop_value;
3554                }
3555            repeat_max -= repeat_min;            repeat_max -= repeat_min;
3556            *code++ = OP_UPTO + repeat_type;  
3557            PUT2INC(code, 0, repeat_max);            if (repeat_max == 1)
3558                {
3559                *code++ = OP_QUERY + repeat_type;
3560                }
3561              else
3562                {
3563                *code++ = OP_UPTO + repeat_type;
3564                PUT2INC(code, 0, repeat_max);
3565                }
3566            }            }
3567          }          }
3568    
# Line 2524  for (;; ptr++) Line 3578  for (;; ptr++)
3578  #endif  #endif
3579        *code++ = c;        *code++ = c;
3580    
3581        /* For a repeated Unicode property match, there is an extra byte that        /* For a repeated Unicode property match, there are two extra bytes that
3582        defines the required property. */        define the required property. */
3583    
3584  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
3585        if (prop_type >= 0) *code++ = prop_type;        if (prop_type >= 0)
3586            {
3587            *code++ = prop_type;
3588            *code++ = prop_value;
3589            }
3590  #endif  #endif
3591        }        }
3592    
# Line 2571  for (;; ptr++) Line 3629  for (;; ptr++)
3629      /* If previous was a bracket group, we may have to replicate it in certain      /* If previous was a bracket group, we may have to replicate it in certain
3630      cases. */      cases. */
3631    
3632      else if (*previous >= OP_BRA || *previous == OP_ONCE ||      else if (*previous == OP_BRA  || *previous == OP_CBRA ||
3633               *previous == OP_COND)               *previous == OP_ONCE || *previous == OP_COND)
3634        {        {
3635        register int i;        register int i;
3636        int ketoffset = 0;        int ketoffset = 0;
3637        int len = code - previous;        int len = code - previous;
3638        uschar *bralink = NULL;        uschar *bralink = NULL;
3639    
3640          /* Repeating a DEFINE group is pointless */
3641    
3642          if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3643            {
3644            *errorcodeptr = ERR55;
3645            goto FAILED;
3646            }
3647    
3648        /* If the maximum repeat count is unlimited, find the end of the bracket        /* If the maximum repeat count is unlimited, find the end of the bracket
3649        by scanning through from the start, and compute the offset back to it        by scanning through from the start, and compute the offset back to it
3650        from the current code pointer. There may be an OP_OPT setting following        from the current code pointer. There may be an OP_OPT setting following
# Line 2613  for (;; ptr++) Line 3679  for (;; ptr++)
3679          /* If the maximum is 1 or unlimited, we just have to stick in the          /* If the maximum is 1 or unlimited, we just have to stick in the
3680          BRAZERO and do no more at this point. However, we do need to adjust          BRAZERO and do no more at this point. However, we do need to adjust
3681          any OP_RECURSE calls inside the group that refer to the group itself or          any OP_RECURSE calls inside the group that refer to the group itself or
3682          any internal group, because the offset is from the start of the whole          any internal or forward referenced group, because the offset is from
3683          regex. Temporarily terminate the pattern while doing this. */          the start of the whole regex. Temporarily terminate the pattern while
3684            doing this. */
3685    
3686          if (repeat_max <= 1)          if (repeat_max <= 1)
3687            {            {
3688            *code = OP_END;            *code = OP_END;
3689            adjust_recurse(previous, 1, utf8, cd);            adjust_recurse(previous, 1, utf8, cd, save_hwm);
3690            memmove(previous+1, previous, len);            memmove(previous+1, previous, len);
3691            code++;            code++;
3692            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
# Line 2637  for (;; ptr++) Line 3704  for (;; ptr++)
3704            {            {
3705            int offset;            int offset;
3706            *code = OP_END;            *code = OP_END;
3707            adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd);            adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3708            memmove(previous + 2 + LINK_SIZE, previous, len);            memmove(previous + 2 + LINK_SIZE, previous, len);
3709            code += 2 + LINK_SIZE;            code += 2 + LINK_SIZE;
3710            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
# Line 2657  for (;; ptr++) Line 3724  for (;; ptr++)
3724        /* If the minimum is greater than zero, replicate the group as many        /* If the minimum is greater than zero, replicate the group as many
3725        times as necessary, and adjust the maximum to the number of subsequent        times as necessary, and adjust the maximum to the number of subsequent
3726        copies that we need. If we set a first char from the group, and didn't        copies that we need. If we set a first char from the group, and didn't
3727        set a required char, copy the latter from the former. */        set a required char, copy the latter from the former. If there are any
3728          forward reference subroutine calls in the group, there will be entries on
3729          the workspace list; replicate these with an appropriate increment. */
3730    
3731        else        else
3732          {          {
3733          if (repeat_min > 1)          if (repeat_min > 1)
3734            {            {
3735            if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;            /* In the pre-compile phase, we don't actually do the replication. We
3736            for (i = 1; i < repeat_min; i++)            just adjust the length as if we had. Do some paranoid checks for
3737              potential integer overflow. */
3738    
3739              if (lengthptr != NULL)
3740                {
3741                int delta = (repeat_min - 1)*length_prevgroup;
3742                if ((double)(repeat_min - 1)*(double)length_prevgroup >
3743                                                                (double)INT_MAX ||
3744                    OFLOW_MAX - *lengthptr < delta)
3745                  {
3746                  *errorcodeptr = ERR20;
3747                  goto FAILED;
3748                  }
3749                *lengthptr += delta;
3750                }
3751    
3752              /* This is compiling for real */
3753    
3754              else
3755              {              {
3756              memcpy(code, previous, len);              if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3757              code += len;              for (i = 1; i < repeat_min; i++)
3758                  {
3759                  uschar *hc;
3760                  uschar *this_hwm = cd->hwm;
3761                  memcpy(code, previous, len);
3762                  for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3763                    {
3764                    PUT(cd->hwm, 0, GET(hc, 0) + len);
3765                    cd->hwm += LINK_SIZE;
3766                    }
3767                  save_hwm = this_hwm;
3768                  code += len;
3769                  }
3770              }              }
3771            }            }
3772    
3773          if (repeat_max > 0) repeat_max -= repeat_min;          if (repeat_max > 0) repeat_max -= repeat_min;
3774          }          }
3775    
# Line 2677  for (;; ptr++) Line 3777  for (;; ptr++)
3777        the maximum is limited, it replicates the group in a nested fashion,        the maximum is limited, it replicates the group in a nested fashion,
3778        remembering the bracket starts on a stack. In the case of a zero minimum,        remembering the bracket starts on a stack. In the case of a zero minimum,
3779        the first one was set up above. In all cases the repeat_max now specifies        the first one was set up above. In all cases the repeat_max now specifies
3780        the number of additional copies needed. */        the number of additional copies needed. Again, we must remember to
3781          replicate entries on the forward reference list. */
3782    
3783        if (repeat_max >= 0)        if (repeat_max >= 0)
3784          {          {
3785          for (i = repeat_max - 1; i >= 0; i--)          /* In the pre-compile phase, we don't actually do the replication. We
3786            just adjust the length as if we had. For each repetition we must add 1
3787            to the length for BRAZERO and for all but the last repetition we must
3788            add 2 + 2*LINKSIZE to allow for the nesting that occurs. Do some
3789            paranoid checks to avoid integer overflow. */
3790    
3791            if (lengthptr != NULL && repeat_max > 0)
3792              {
3793              int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3794                          2 - 2*LINK_SIZE;   /* Last one doesn't nest */
3795              if ((double)repeat_max *
3796                    (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
3797                      > (double)INT_MAX ||
3798                  OFLOW_MAX - *lengthptr < delta)
3799                {
3800                *errorcodeptr = ERR20;
3801                goto FAILED;
3802                }
3803              *lengthptr += delta;
3804              }
3805    
3806            /* This is compiling for real */
3807    
3808            else for (i = repeat_max - 1; i >= 0; i--)
3809            {            {
3810              uschar *hc;
3811              uschar *this_hwm = cd->hwm;
3812    
3813            *code++ = OP_BRAZERO + repeat_type;            *code++ = OP_BRAZERO + repeat_type;
3814    
3815            /* All but the final copy start a new nesting, maintaining the            /* All but the final copy start a new nesting, maintaining the
# Line 2698  for (;; ptr++) Line 3825  for (;; ptr++)
3825              }              }
3826    
3827            memcpy(code, previous, len);            memcpy(code, previous, len);
3828              for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3829                {
3830                PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3831                cd->hwm += LINK_SIZE;
3832                }
3833              save_hwm = this_hwm;
3834            code += len;            code += len;
3835            }            }
3836    
# Line 2720  for (;; ptr++) Line 3853  for (;; ptr++)
3853        /* If the maximum is unlimited, set a repeater in the final copy. We        /* If the maximum is unlimited, set a repeater in the final copy. We
3854        can't just offset backwards from the current code point, because we        can't just offset backwards from the current code point, because we
3855        don't know if there's been an options resetting after the ket. The        don't know if there's been an options resetting after the ket. The
3856        correct offset was computed above. */        correct offset was computed above.
3857    
3858        else code[-ketoffset] = OP_KETRMAX + repeat_type;        Then, when we are doing the actual compile phase, check to see whether
3859          this group is a non-atomic one that could match an empty string. If so,
3860          convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
3861          that runtime checking can be done. [This check is also applied to
3862          atomic groups at runtime, but in a different way.] */
3863    
3864          else
3865            {
3866            uschar *ketcode = code - ketoffset;
3867            uschar *bracode = ketcode - GET(ketcode, 1);
3868            *ketcode = OP_KETRMAX + repeat_type;
3869            if (lengthptr == NULL && *bracode != OP_ONCE)
3870              {
3871              uschar *scode = bracode;
3872              do
3873                {
3874                if (could_be_empty_branch(scode, ketcode, utf8))
3875                  {
3876                  *bracode += OP_SBRA - OP_BRA;
3877                  break;
3878                  }
3879                scode += GET(scode, 1);
3880                }
3881              while (*scode == OP_ALT);
3882              }
3883            }
3884        }        }
3885    
3886      /* Else there's some kind of shambles */      /* Else there's some kind of shambles */
# Line 2733  for (;; ptr++) Line 3891  for (;; ptr++)
3891        goto FAILED;        goto FAILED;
3892        }        }
3893    
3894      /* If the character following a repeat is '+', we wrap the entire repeated      /* If the character following a repeat is '+', or if certain optimization
3895      item inside OP_ONCE brackets. This is just syntactic sugar, taken from      tests above succeeded, possessive_quantifier is TRUE. For some of the
3896      Sun's Java package. The repeated item starts at tempcode, not at previous,      simpler opcodes, there is an special alternative opcode for this. For
3897      which might be the first part of a string whose (former) last char we      anything else, we wrap the entire repeated item inside OP_ONCE brackets.
3898      repeated. However, we don't support '+' after a greediness '?'. */      The '+' notation is just syntactic sugar, taken from Sun's Java package,
3899        but the special opcodes can optimize it a bit. The repeated item starts at
3900        tempcode, not at previous, which might be the first part of a string whose
3901        (former) last char we repeated.
3902    
3903        Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
3904        an 'upto' may follow. We skip over an 'exact' item, and then test the
3905        length of what remains before proceeding. */
3906    
3907      if (possessive_quantifier)      if (possessive_quantifier)
3908        {        {
3909        int len = code - tempcode;        int len;
3910        memmove(tempcode + 1+LINK_SIZE, tempcode, len);        if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
3911        code += 1 + LINK_SIZE;            *tempcode == OP_NOTEXACT)
3912        len += 1 + LINK_SIZE;          tempcode += _pcre_OP_lengths[*tempcode];
3913        tempcode[0] = OP_ONCE;        len = code - tempcode;
3914        *code++ = OP_KET;        if (len > 0) switch (*tempcode)
3915        PUTINC(code, 0, len);          {
3916        PUT(tempcode, 1, len);          case OP_STAR:  *tempcode = OP_POSSTAR; break;
3917            case OP_PLUS:  *tempcode = OP_POSPLUS; break;
3918            case OP_QUERY: *tempcode = OP_POSQUERY; break;
3919            case OP_UPTO:  *tempcode = OP_POSUPTO; break;
3920    
3921            case OP_TYPESTAR:  *tempcode = OP_TYPEPOSSTAR; break;
3922            case OP_TYPEPLUS:  *tempcode = OP_TYPEPOSPLUS; break;
3923            case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
3924            case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;
3925    
3926            case OP_NOTSTAR:  *tempcode = OP_NOTPOSSTAR; break;
3927            case OP_NOTPLUS:  *tempcode = OP_NOTPOSPLUS; break;
3928            case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
3929            case OP_NOTUPTO:  *tempcode = OP_NOTPOSUPTO; break;
3930    
3931            default:
3932            memmove(tempcode + 1+LINK_SIZE, tempcode, len);
3933            code += 1 + LINK_SIZE;
3934            len += 1 + LINK_SIZE;
3935            tempcode[0] = OP_ONCE;
3936            *code++ = OP_KET;
3937            PUTINC(code, 0, len);
3938            PUT(tempcode, 1, len);
3939            break;
3940            }
3941        }        }
3942    
3943      /* In all case we no longer have a previous item. We also set the      /* In all case we no longer have a previous item. We also set the
# Line 2761  for (;; ptr++) Line 3950  for (;; ptr++)
3950      break;      break;
3951    
3952    
3953      /* Start of nested bracket sub-expression, or comment or lookahead or      /* ===================================================================*/
3954      lookbehind or option setting or condition. First deal with special things      /* Start of nested parenthesized sub-expression, or comment or lookahead or
3955      that can come after a bracket; all are introduced by ?, and the appearance      lookbehind or option setting or condition or all the other extended
3956      of any of them means that this is not a referencing group. They were      parenthesis forms.  */
     checked for validity in the first pass over the string, so we don't have to  
     check for syntax errors here.  */  
3957    
3958      case '(':      case '(':
3959      newoptions = options;      newoptions = options;
3960      skipbytes = 0;      skipbytes = 0;
3961        bravalue = OP_CBRA;
3962        save_hwm = cd->hwm;
3963        reset_bracount = FALSE;
3964    
3965        /* First deal with various "verbs" that can be introduced by '*'. */
3966    
3967        if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
3968          {
3969          int i, namelen;
3970          const uschar *name = ++ptr;
3971          previous = NULL;
3972          while ((cd->ctypes[*++ptr] & ctype_letter) != 0);
3973          if (*ptr == ':')
3974            {
3975            *errorcodeptr = ERR59;   /* Not supported */
3976            goto FAILED;
3977            }
3978          if (*ptr != ')')
3979            {
3980            *errorcodeptr = ERR60;
3981            goto FAILED;
3982            }
3983          namelen = ptr - name;
3984          for (i = 0; i < verbcount; i++)
3985            {
3986            if (namelen == verbs[i].len &&
3987                strncmp((char *)name, verbs[i].name, namelen) == 0)
3988              {
3989              *code = verbs[i].op;
3990              if (*code++ == OP_ACCEPT) cd->had_accept = TRUE;
3991              break;
3992              }
3993            }
3994          if (i < verbcount) continue;
3995          *errorcodeptr = ERR60;
3996          goto FAILED;
3997          }
3998    
3999      if (*(++ptr) == '?')      /* Deal with the extended parentheses; all are introduced by '?', and the
4000        appearance of any of them means that this is not a capturing group. */
4001    
4002        else if (*ptr == '?')
4003        {        {
4004        int set, unset;        int i, set, unset, namelen;
4005        int *optset;        int *optset;
4006          const uschar *name;
4007          uschar *slot;
4008    
4009        switch (*(++ptr))        switch (*(++ptr))
4010          {          {
4011          case '#':                 /* Comment; skip to ket */          case '#':                 /* Comment; skip to ket */
4012          ptr++;          ptr++;
4013          while (*ptr != ')') ptr++;          while (*ptr != 0 && *ptr != ')') ptr++;
4014            if (*ptr == 0)
4015              {
4016              *errorcodeptr = ERR18;
4017              goto FAILED;
4018              }
4019          continue;          continue;
4020    
4021          case ':':                 /* Non-extracting bracket */  
4022            /* ------------------------------------------------------------ */
4023            case '|':                 /* Reset capture count for each branch */
4024            reset_bracount = TRUE;
4025            /* Fall through */
4026    
4027            /* ------------------------------------------------------------ */
4028            case ':':                 /* Non-capturing bracket */
4029          bravalue = OP_BRA;          bravalue = OP_BRA;
4030          ptr++;          ptr++;
4031          break;          break;
4032    
4033    
4034            /* ------------------------------------------------------------ */
4035          case '(':          case '(':
4036          bravalue = OP_COND;       /* Conditional group */          bravalue = OP_COND;       /* Conditional group */
4037    
4038          /* Condition to test for recursion */          /* A condition can be an assertion, a number (referring to a numbered
4039            group), a name (referring to a named group), or 'R', referring to
4040            recursion. R<digits> and R&name are also permitted for recursion tests.
4041    
4042            There are several syntaxes for testing a named group: (?(name)) is used
4043            by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
4044    
4045            There are two unfortunate ambiguities, caused by history. (a) 'R' can
4046            be the recursive thing or the name 'R' (and similarly for 'R' followed
4047            by digits), and (b) a number could be a name that consists of digits.
4048            In both cases, we look for a name first; if not found, we try the other
4049            cases. */
4050    
4051            /* For conditions that are assertions, check the syntax, and then exit
4052            the switch. This will take control down to where bracketed groups,
4053            including assertions, are processed. */
4054    
4055            if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
4056              break;
4057    
4058            /* Most other conditions use OP_CREF (a couple change to OP_RREF
4059            below), and all need to skip 3 bytes at the start of the group. */
4060    
4061            code[1+LINK_SIZE] = OP_CREF;
4062            skipbytes = 3;
4063            refsign = -1;
4064    
4065            /* Check for a test for recursion in a named group. */
4066    
4067          if (ptr[1] == 'R')          if (ptr[1] == 'R' && ptr[2] == '&')
4068            {            {
4069            code[1+LINK_SIZE] = OP_CREF;            terminator = -1;
4070            PUT2(code, 2+LINK_SIZE, CREF_RECURSE);            ptr += 2;
4071            skipbytes = 3;            code[1+LINK_SIZE] = OP_RREF;    /* Change the type of test */
           ptr += 3;  
4072            }            }
4073    
4074          /* Condition to test for a numbered subpattern match. We know that          /* Check for a test for a named group's having been set, using the Perl
4075          if a digit follows ( then there will just be digits until ) because          syntax (?(<name>) or (?('name') */
         the syntax was checked in the first pass. */  
4076    
4077          else if ((digitab[ptr[1]] && ctype_digit) != 0)          else if (ptr[1] == '<')
4078            {            {
4079            int condref;                 /* Don't amalgamate; some compilers */            terminator = '>';
           condref = *(++ptr) - '0';    /* grumble at autoincrement in declaration */  
           while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';  
           if (condref == 0)  
             {  
             *errorcodeptr = ERR35;  
             goto FAILED;  
             }  
4080            ptr++;            ptr++;
           code[1+LINK_SIZE] = OP_CREF;  
           PUT2(code, 2+LINK_SIZE, condref);  
           skipbytes = 3;  
4081            }            }
4082          /* For conditions that are assertions, we just fall through, having          else if (ptr[1] == '\'')
         set bravalue above. */  
         break;  
   
         case '=':                 /* Positive lookahead */  
         bravalue = OP_ASSERT;  
         ptr++;  
         break;  
   
         case '!':                 /* Negative lookahead */  
         bravalue = OP_ASSERT_NOT;  
         ptr++;  
         break;  
   
         case '<':                 /* Lookbehinds */  
         switch (*(++ptr))  
4083            {            {
4084            case '=':               /* Positive lookbehind */            terminator = '\'';
           bravalue = OP_ASSERTBACK;  
4085            ptr++;            ptr++;
4086            break;            }
4087            else
4088              {
4089              terminator = 0;
4090              if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
4091              }
4092    
4093            case '!':               /* Negative lookbehind */          /* We now expect to read a name; any thing else is an error */
4094            bravalue = OP_ASSERTBACK_NOT;  
4095            ptr++;          if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
4096            break;            {
4097              ptr += 1;  /* To get the right offset */
4098              *errorcodeptr = ERR28;
4099              goto FAILED;
4100            }            }
         break;  
4101    
4102          case '>':                 /* One-time brackets */          /* Read the name, but also get it as a number if it's all digits */
         bravalue = OP_ONCE;  
         ptr++;  
         break;  
4103    
4104          case 'C':                 /* Callout - may be followed by digits; */          recno = 0;
4105          previous_callout = code;  /* Save for later completion */          name = ++ptr;
4106          after_manual_callout = 1; /* Skip one item before completing */          while ((cd->ctypes[*ptr] & ctype_word) != 0)
4107          *code++ = OP_CALLOUT;     /* Already checked that the terminating */            {
4108            {                       /* closing parenthesis is present. */            if (recno >= 0)
4109            int n = 0;              recno = ((digitab[*ptr] & ctype_digit) != 0)?
4110            while ((digitab[*(++ptr)] & ctype_digit) != 0)                recno * 10 + *ptr - '0' : -1;
4111              n = n * 10 + *ptr - '0';            ptr++;
           if (n > 255)  
             {  
             *errorcodeptr = ERR38;  
             goto FAILED;  
             }  
           *code++ = n;  
           PUT(code, 0, ptr - cd->start_pattern + 1);  /* Pattern offset */  
           PUT(code, LINK_SIZE, 0);                    /* Default length */  
           code += 2 * LINK_SIZE;  
4112            }            }
4113          previous = NULL;          namelen = ptr - name;
         continue;  
4114    
4115          case 'P':                 /* Named subpattern handling */          if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
         if (*(++ptr) == '<')      /* Definition */  
4116            {            {
4117            int i, namelen;            ptr--;      /* Error offset */
4118            uschar *slot = cd->name_table;            *errorcodeptr = ERR26;
4119            const uschar *name;     /* Don't amalgamate; some compilers */            goto FAILED;
4120            name = ++ptr;           /* grumble at autoincrement in declaration */            }
4121    
4122            while (*ptr++ != '>');          /* Do no further checking in the pre-compile phase. */
           namelen = ptr - name - 1;  
4123    
4124            for (i = 0; i < cd->names_found; i++)          if (lengthptr != NULL) break;
4125    
4126            /* In the real compile we do the work of looking for the actual
4127            reference. If the string started with "+" or "-" we require the rest to
4128            be digits, in which case recno will be set. */
4129    
4130            if (refsign > 0)
4131              {
4132              if (recno <= 0)
4133              {              {
4134              int crc = memcmp(name, slot+2, namelen);              *errorcodeptr = ERR58;
4135              if (crc == 0)              goto FAILED;
4136                {              }
4137                if (slot[2+namelen] == 0)            if (refsign == '-')
4138                  {              {
4139                  *errorcodeptr = ERR43;              recno = cd->bracount - recno + 1;
4140                  goto FAILED;              if (recno <= 0)
                 }  
               crc = -1;             /* Current name is substring */  
               }  
             if (crc < 0)  
4141                {                {
4142                memmove(slot + cd->name_entry_size, slot,                *errorcodeptr = ERR15;
4143                  (cd->names_found - i) * cd->name_entry_size);                goto FAILED;
               break;  
4144                }                }
             slot += cd->name_entry_size;  
4145              }              }
4146              else recno += cd->bracount;
4147            PUT2(slot, 0, *brackets + 1);            PUT2(code, 2+LINK_SIZE, recno);
4148            memcpy(slot + 2, name, namelen);            break;
           slot[2+namelen] = 0;  
           cd->names_found++;  
           goto NUMBERED_GROUP;  
4149            }            }
4150    
4151          if (*ptr == '=' || *ptr == '>')  /* Reference or recursion */          /* Otherwise (did not start with "+" or "-"), start by looking for the
4152            name. */
4153    
4154            slot = cd->name_table;
4155            for (i = 0; i < cd->names_found; i++)
4156            {            {
4157            int i, namelen;            if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4158            int type = *ptr++;            slot += cd->name_entry_size;
4159            const uschar *name = ptr;            }
           uschar *slot = cd->name_table;  
4160    
4161            while (*ptr != ')') ptr++;          /* Found a previous named subpattern */
           namelen = ptr - name;  
4162    
4163            for (i = 0; i < cd->names_found; i++)          if (i < cd->names_found)
4164              {            {
4165              if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;            recno = GET2(slot, 0);
4166              slot += cd->name_entry_size;            PUT2(code, 2+LINK_SIZE, recno);
4167              }            }
4168            if (i >= cd->names_found)  
4169            /* Search the pattern for a forward reference */
4170    
4171            else if ((i = find_parens(ptr, cd->bracount, name, namelen,
4172                            (options & PCRE_EXTENDED) != 0)) > 0)
4173              {
4174              PUT2(code, 2+LINK_SIZE, i);
4175              }
4176    
4177            /* If terminator == 0 it means that the name followed directly after
4178            the opening parenthesis [e.g. (?(abc)...] and in this case there are
4179            some further alternatives to try. For the cases where terminator != 0
4180            [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
4181            now checked all the possibilities, so give an error. */
4182    
4183            else if (terminator != 0)
4184              {
4185              *errorcodeptr = ERR15;
4186              goto FAILED;
4187              }
4188    
4189            /* Check for (?(R) for recursion. Allow digits after R to specify a
4190            specific group number. */
4191    
4192            else if (*name == 'R')
4193              {
4194              recno = 0;
4195              for (i = 1; i < namelen; i++)
4196              {              {
4197              *errorcodeptr = ERR15;              if ((digitab[name[i]] & ctype_digit) == 0)
4198              goto FAILED;                {
4199                  *errorcodeptr = ERR15;
4200                  goto FAILED;
4201                  }
4202                recno = recno * 10 + name[i] - '0';
4203              }              }
4204              if (recno == 0) recno = RREF_ANY;
4205              code[1+LINK_SIZE] = OP_RREF;      /* Change test type */
4206              PUT2(code, 2+LINK_SIZE, recno);
4207              }
4208    
4209            recno = GET2(slot, 0);          /* Similarly, check for the (?(DEFINE) "condition", which is always
4210            false. */
4211    
4212            else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
4213              {
4214              code[1+LINK_SIZE] = OP_DEF;
4215              skipbytes = 1;
4216              }
4217    
4218            if (type == '>') goto HANDLE_RECURSION;  /* A few lines below */          /* Check for the "name" actually being a subpattern number. */
4219    
4220            /* Back reference */          else if (recno > 0)
4221              {
4222              PUT2(code, 2+LINK_SIZE, recno);
4223              }
4224    
4225            previous = code;          /* Either an unidentified subpattern, or a reference to (?(0) */
4226            *code++ = OP_REF;  
4227            PUT2INC(code, 0, recno);          else
4228            cd->backref_map |= (recno < 32)? (1 << recno) : 1;            {
4229            if (recno > cd->top_backref) cd->top_backref = recno;            *errorcodeptr = (recno == 0)? ERR35: ERR15;
4230              goto FAILED;
4231              }
4232            break;
4233    
4234    
4235            /* ------------------------------------------------------------ */
4236            case '=':                 /* Positive lookahead */
4237            bravalue = OP_ASSERT;
4238            ptr++;
4239            break;
4240    
4241    
4242            /* ------------------------------------------------------------ */
4243            case '!':                 /* Negative lookahead */
4244            ptr++;
4245            if (*ptr == ')')          /* Optimize (?!) */
4246              {
4247              *code++ = OP_FAIL;
4248              previous = NULL;
4249            continue;            continue;
4250            }            }
4251            bravalue = OP_ASSERT_NOT;
4252            break;
4253    
4254    
4255            /* ------------------------------------------------------------ */
4256            case '<':                 /* Lookbehind or named define */
4257            switch (ptr[1])
4258              {
4259              case '=':               /* Positive lookbehind */
4260              bravalue = OP_ASSERTBACK;
4261              ptr += 2;
4262              break;
4263    
4264              case '!':               /* Negative lookbehind */
4265              bravalue = OP_ASSERTBACK_NOT;
4266              ptr += 2;
4267              break;
4268    
4269              default:                /* Could be name define, else bad */
4270              if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
4271              ptr++;                  /* Correct offset for error */
4272              *errorcodeptr = ERR24;
4273              goto FAILED;
4274              }
4275            break;
4276    
4277    
4278          /* Should never happen */          /* ------------------------------------------------------------ */
4279            case '>':                 /* One-time brackets */
4280            bravalue = OP_ONCE;
4281            ptr++;
4282          break;          break;
4283    
4284          case 'R':                 /* Pattern recursion */  
4285            /* ------------------------------------------------------------ */
4286            case 'C':                 /* Callout - may be followed by digits; */
4287            previous_callout = code;  /* Save for later completion */
4288            after_manual_callout = 1; /* Skip one item before completing */
4289            *code++ = OP_CALLOUT;
4290              {
4291              int n = 0;
4292              while ((digitab[*(++ptr)] & ctype_digit) != 0)
4293                n = n * 10 + *ptr - '0';
4294              if (*ptr != ')')
4295                {
4296                *errorcodeptr = ERR39;
4297                goto FAILED;
4298                }
4299              if (n > 255)
4300                {
4301                *errorcodeptr = ERR38;
4302                goto FAILED;
4303                }
4304              *code++ = n;
4305              PUT(code, 0, ptr - cd->start_pattern + 1);  /* Pattern offset */
4306              PUT(code, LINK_SIZE, 0);                    /* Default length */
4307              code += 2 * LINK_SIZE;
4308              }
4309            previous = NULL;
4310            continue;
4311    
4312    
4313            /* ------------------------------------------------------------ */
4314            case 'P':                 /* Python-style named subpattern handling */
4315            if (*(++ptr) == '=' || *ptr == '>')  /* Reference or recursion */
4316              {
4317              is_recurse = *ptr == '>';
4318              terminator = ')';
4319              goto NAMED_REF_OR_RECURSE;
4320              }
4321            else if (*ptr != '<')    /* Test for Python-style definition */
4322              {
4323              *errorcodeptr = ERR41;
4324              goto FAILED;
4325              }
4326            /* Fall through to handle (?P< as (?< is handled */
4327    
4328    
4329            /* ------------------------------------------------------------ */
4330            DEFINE_NAME:    /* Come here from (?< handling */
4331            case '\'':
4332              {
4333              terminator = (*ptr == '<')? '>' : '\'';
4334              name = ++ptr;
4335    
4336              while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4337              namelen = ptr - name;
4338    
4339              /* In the pre-compile phase, just do a syntax check. */
4340    
4341              if (lengthptr != NULL)
4342                {
4343                if (*ptr != terminator)
4344                  {
4345                  *errorcodeptr = ERR42;
4346                  goto FAILED;
4347                  }
4348                if (cd->names_found >= MAX_NAME_COUNT)
4349                  {
4350                  *errorcodeptr = ERR49;
4351                  goto FAILED;
4352                  }
4353                if (namelen + 3 > cd->name_entry_size)
4354                  {
4355                  cd->name_entry_size = namelen + 3;
4356                  if (namelen > MAX_NAME_SIZE)
4357                    {
4358                    *errorcodeptr = ERR48;
4359                    goto FAILED;
4360                    }
4361                  }
4362                }
4363    
4364              /* In the real compile, create the entry in the table */
4365    
4366              else
4367                {
4368                slot = cd->name_table;
4369                for (i = 0; i < cd->names_found; i++)
4370                  {
4371                  int crc = memcmp(name, slot+2, namelen);
4372                  if (crc == 0)
4373                    {
4374                    if (slot[2+namelen] == 0)
4375                      {
4376                      if ((options & PCRE_DUPNAMES) == 0)
4377                        {
4378                        *errorcodeptr = ERR43;
4379                        goto FAILED;
4380                        }
4381                      }
4382                    else crc = -1;      /* Current name is substring */
4383                    }
4384                  if (crc < 0)
4385                    {
4386                    memmove(slot + cd->name_entry_size, slot,
4387                      (cd->names_found - i) * cd->name_entry_size);
4388                    break;
4389                    }
4390                  slot += cd->name_entry_size;
4391                  }
4392    
4393                PUT2(slot, 0, cd->bracount + 1);
4394                memcpy(slot + 2, name, namelen);
4395                slot[2+namelen] = 0;
4396                }
4397              }
4398    
4399            /* In both cases, count the number of names we've encountered. */
4400    
4401            ptr++;                    /* Move past > or ' */
4402            cd->names_found++;
4403            goto NUMBERED_GROUP;
4404    
4405    
4406            /* ------------------------------------------------------------ */
4407            case '&':                 /* Perl recursion/subroutine syntax */
4408            terminator = ')';
4409            is_recurse = TRUE;
4410            /* Fall through */
4411    
4412            /* We come here from the Python syntax above that handles both
4413            references (?P=name) and recursion (?P>name), as well as falling
4414            through from the Perl recursion syntax (?&name). */
4415    
4416            NAMED_REF_OR_RECURSE:
4417            name = ++ptr;
4418            while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4419            namelen = ptr - name;
4420    
4421            /* In the pre-compile phase, do a syntax check and set a dummy
4422            reference number. */
4423    
4424            if (lengthptr != NULL)
4425              {
4426              if (*ptr != terminator)
4427                {
4428                *errorcodeptr = ERR42;
4429                goto FAILED;
4430                }
4431              if (namelen > MAX_NAME_SIZE)
4432                {
4433                *errorcodeptr = ERR48;
4434                goto FAILED;
4435                }
4436              recno = 0;
4437              }
4438    
4439            /* In the real compile, seek the name in the table */
4440    
4441            else
4442              {
4443              slot = cd->name_table;
4444              for (i = 0; i < cd->names_found; i++)
4445                {
4446                if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4447                slot += cd->name_entry_size;
4448                }
4449    
4450              if (i < cd->names_found)         /* Back reference */
4451                {
4452                recno = GET2(slot, 0);
4453                }
4454              else if ((recno =                /* Forward back reference */
4455                        find_parens(ptr, cd->bracount, name, namelen,
4456                          (options & PCRE_EXTENDED) != 0)) <= 0)
4457                {
4458                *errorcodeptr = ERR15;
4459                goto FAILED;
4460                }
4461              }
4462    
4463            /* In both phases, we can now go to the code that handles numerical
4464            recursion or backreferences. */
4465    
4466            if (is_recurse) goto HANDLE_RECURSION;
4467              else goto HANDLE_REFERENCE;
4468    
4469    
4470            /* ------------------------------------------------------------ */
4471            case 'R':                 /* Recursion */
4472          ptr++;                    /* Same as (?0)      */          ptr++;                    /* Same as (?0)      */
4473          /* Fall through */          /* Fall through */
4474    
         /* Recursion or "subroutine" call */  
4475    
4476          case '0': case '1': case '2': case '3': case '4':          /* ------------------------------------------------------------ */
4477          case '5': case '6': case '7': case '8': case '9':          case '-': case '+':
4478            case '0': case '1': case '2': case '3': case '4':   /* Recursion or */
4479            case '5': case '6': case '7': case '8': case '9':   /* subroutine */
4480            {            {
4481            const uschar *called;            const uschar *called;
4482    
4483              if ((refsign = *ptr) == '+') ptr++;
4484              else if (refsign == '-')
4485                {
4486                if ((digitab[ptr[1]] & ctype_digit) == 0)
4487                  goto OTHER_CHAR_AFTER_QUERY;
4488                ptr++;
4489                }
4490    
4491            recno = 0;            recno = 0;
4492            while((digitab[*ptr] & ctype_digit) != 0)            while((digitab[*ptr] & ctype_digit) != 0)
4493              recno = recno * 10 + *ptr++ - '0';              recno = recno * 10 + *ptr++ - '0';
4494    
4495              if (*ptr != ')')
4496                {
4497                *errorcodeptr = ERR29;
4498                goto FAILED;
4499                }
4500    
4501              if (refsign == '-')
4502                {
4503                if (recno == 0)
4504                  {
4505                  *errorcodeptr = ERR58;
4506                  goto FAILED;
4507                  }
4508                recno = cd->bracount - recno + 1;
4509                if (recno <= 0)
4510                  {
4511                  *errorcodeptr = ERR15;
4512                  goto FAILED;
4513                  }
4514                }
4515              else if (refsign == '+')
4516                {
4517                if (recno == 0)
4518                  {
4519                  *errorcodeptr = ERR58;
4520                  goto FAILED;
4521                  }
4522                recno += cd->bracount;
4523                }
4524    
4525            /* Come here from code above that handles a named recursion */            /* Come here from code above that handles a named recursion */
4526    
4527            HANDLE_RECURSION:            HANDLE_RECURSION:
4528    
4529            previous = code;            previous = code;
4530              called = cd->start_code;
4531    
4532            /* Find the bracket that is being referenced. Temporarily end the            /* When we are actually compiling, find the bracket that is being
4533            regex in case it doesn't exist. */            referenced. Temporarily end the regex in case it doesn't exist before
4534              this point. If we end up with a forward reference, first check that
4535            *code = OP_END;            the bracket does occur later so we can give the error (and position)
4536            called = (recno == 0)?            now. Then remember this forward reference in the workspace so it can
4537              cd->start_code : find_bracket(cd->start_code, utf8, recno);            be filled in at the end. */
4538    
4539            if (called == NULL)            if (lengthptr == NULL)
4540              {              {
4541              *errorcodeptr = ERR15;              *code = OP_END;
4542              goto FAILED;              if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
             }  
4543    
4544            /* If the subpattern is still open, this is a recursive call. We              /* Forward reference */
           check to see if this is a left recursion that could loop for ever,  
           and diagnose that case. */  
4545    
4546            if (GET(called, 1) == 0 && could_be_empty(called, code, bcptr, utf8))              if (called == NULL)
4547              {                {
4548              *errorcodeptr = ERR40;                if (find_parens(ptr, cd->bracount, NULL, recno,
4549              goto FAILED;                     (options & PCRE_EXTENDED) != 0) < 0)
4550                    {
4551                    *errorcodeptr = ERR15;
4552                    goto FAILED;
4553                    }
4554                  called = cd->start_code + recno;
4555                  PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
4556                  }
4557    
4558                /* If not a forward reference, and the subpattern is still open,
4559                this is a recursive call. We check to see if this is a left
4560                recursion that could loop for ever, and diagnose that case. */
4561    
4562                else if (GET(called, 1) == 0 &&
4563                         could_be_empty(called, code, bcptr, utf8))
4564                  {
4565                  *errorcodeptr = ERR40;
4566                  goto FAILED;
4567                  }
4568              }              }
4569    
4570            /* Insert the recursion/subroutine item */            /* Insert the recursion/subroutine item, automatically wrapped inside
4571              "once" brackets. Set up a "previous group" length so that a
4572              subsequent quantifier will work. */
4573    
4574              *code = OP_ONCE;
4575              PUT(code, 1, 2 + 2*LINK_SIZE);
4576              code += 1 + LINK_SIZE;
4577    
4578            *code = OP_RECURSE;            *code = OP_RECURSE;
4579            PUT(code, 1, called - cd->start_code);            PUT(code, 1, called - cd->start_code);
4580            code += 1 + LINK_SIZE;            code += 1 + LINK_SIZE;
4581    
4582              *code = OP_KET;
4583              PUT(code, 1, 2 + 2*LINK_SIZE);