/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory | Revision Log | View Patch

revision 210 by ph10, Wed Aug 8 14:24:50 2007 UTC revision 793 by ph10, Wed Dec 7 16:52:34 2011 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2007 University of Cambridge             Copyright (c) 1997-2011 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 43  supporting internal functions that are n Line 43  supporting internal functions that are n
43    
44    
45  #ifdef HAVE_CONFIG_H  #ifdef HAVE_CONFIG_H
46  #include <config.h>  #include "config.h"
47  #endif  #endif
48    
49  #define NLBLOCK cd             /* Block containing newline information */  #define NLBLOCK cd             /* Block containing newline information */
# Line 53  supporting internal functions that are n Line 53  supporting internal functions that are n
53  #include "pcre_internal.h"  #include "pcre_internal.h"
54    
55    
56  /* When DEBUG is defined, we need the pcre_printint() function, which is also  /* When PCRE_DEBUG is defined, we need the pcre_printint() function, which is
57  used by pcretest. DEBUG is not defined when building a production library. */  also used by pcretest. PCRE_DEBUG is not defined when building a production
58    library. */
59    
60  #ifdef DEBUG  #ifdef PCRE_DEBUG
61  #include "pcre_printint.src"  #include "pcre_printint.src"
62  #endif  #endif
63    
# Line 87  so this number is very generous. Line 88  so this number is very generous.
88  The same workspace is used during the second, actual compile phase for  The same workspace is used during the second, actual compile phase for
89  remembering forward references to groups so that they can be filled in at the  remembering forward references to groups so that they can be filled in at the
90  end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE  end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
91  is 4 there is plenty of room. */  is 4 there is plenty of room for most patterns. However, the memory can get
92    filled up by repetitions of forward references, for example patterns like
93    /(?1){0,1999}(b)/, and one user did hit the limit. The code has been changed so
94    that the workspace is expanded using malloc() in this situation. The value
95    below is therefore a minimum, and we put a maximum on it for safety. The
96    minimum is now also defined in terms of LINK_SIZE so that the use of malloc()
97    kicks in at the same number of forward references in all cases. */
98    
99  #define COMPILE_WORK_SIZE (4096)  #define COMPILE_WORK_SIZE (2048*LINK_SIZE)
100    #define COMPILE_WORK_SIZE_MAX (100*COMPILE_WORK_SIZE)
101    
102    /* The overrun tests check for a slightly smaller size so that they detect the
103    overrun before it actually does run off the end of the data block. */
104    
105    #define WORK_SIZE_SAFETY_MARGIN (100)
106    
107    
108  /* Table for handling escaped characters in the range '0'-'z'. Positive returns  /* Table for handling escaped characters in the range '0'-'z'. Positive returns
# Line 97  are simple data values; negative values Line 110  are simple data values; negative values
110  on. Zero means further processing is needed (for things like \x), or the escape  on. Zero means further processing is needed (for things like \x), or the escape
111  is invalid. */  is invalid. */
112    
113  #ifndef EBCDIC  /* This is the "normal" table for ASCII systems */  #ifndef EBCDIC
114    
115    /* This is the "normal" table for ASCII systems or for EBCDIC systems running
116    in UTF-8 mode. */
117    
118  static const short int escapes[] = {  static const short int escapes[] = {
119       0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */       0,                       0,
120       0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */       0,                       0,
121     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */       0,                       0,
122  -ESC_H,      0,      0, -ESC_K,      0,      0,      0,      0,   /* H - O */       0,                       0,
123  -ESC_P, -ESC_Q, -ESC_R, -ESC_S,      0,      0, -ESC_V, -ESC_W,   /* P - W */       0,                       0,
124  -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */       CHAR_COLON,              CHAR_SEMICOLON,
125     '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */       CHAR_LESS_THAN_SIGN,     CHAR_EQUALS_SIGN,
126  -ESC_h,      0,      0, -ESC_k,      0,      0,  ESC_n,      0,   /* h - o */       CHAR_GREATER_THAN_SIGN,  CHAR_QUESTION_MARK,
127  -ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0, -ESC_v, -ESC_w,   /* p - w */       CHAR_COMMERCIAL_AT,      -ESC_A,
128       0,      0, -ESC_z                                            /* x - z */       -ESC_B,                  -ESC_C,
129         -ESC_D,                  -ESC_E,
130         0,                       -ESC_G,
131         -ESC_H,                  0,
132         0,                       -ESC_K,
133         0,                       0,
134         -ESC_N,                  0,
135         -ESC_P,                  -ESC_Q,
136         -ESC_R,                  -ESC_S,
137         0,                       0,
138         -ESC_V,                  -ESC_W,
139         -ESC_X,                  0,
140         -ESC_Z,                  CHAR_LEFT_SQUARE_BRACKET,
141         CHAR_BACKSLASH,          CHAR_RIGHT_SQUARE_BRACKET,
142         CHAR_CIRCUMFLEX_ACCENT,  CHAR_UNDERSCORE,
143         CHAR_GRAVE_ACCENT,       7,
144         -ESC_b,                  0,
145         -ESC_d,                  ESC_e,
146         ESC_f,                   0,
147         -ESC_h,                  0,
148         0,                       -ESC_k,
149         0,                       0,
150         ESC_n,                   0,
151         -ESC_p,                  0,
152         ESC_r,                   -ESC_s,
153         ESC_tee,                 0,
154         -ESC_v,                  -ESC_w,
155         0,                       0,
156         -ESC_z
157  };  };
158    
159  #else           /* This is the "abnormal" table for EBCDIC systems */  #else
160    
161    /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
162    
163  static const short int escapes[] = {  static const short int escapes[] = {
164  /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',  /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
165  /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,  /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
# Line 130  static const short int escapes[] = { Line 178  static const short int escapes[] = {
178  /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',  /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
179  /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,  /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
180  /*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,  /*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,
181  /*  D0 */   '}',     0, -ESC_K,       0,      0,     0,      0, -ESC_P,  /*  D0 */   '}',     0, -ESC_K,       0,      0,-ESC_N,      0, -ESC_P,
182  /*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,  /*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
183  /*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,  /*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,
184  /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,  /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
# Line 140  static const short int escapes[] = { Line 188  static const short int escapes[] = {
188  #endif  #endif
189    
190    
191  /* Table of special "verbs" like (*PRUNE) */  /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
192    searched linearly. Put all the names into a single string, in order to reduce
193    the number of relocations when a shared library is dynamically linked. The
194    string is built from string macros so that it works in UTF-8 mode on EBCDIC
195    platforms. */
196    
197  typedef struct verbitem {  typedef struct verbitem {
198    const char *name;    int   len;                 /* Length of verb name */
199    int   len;    int   op;                  /* Op when no arg, or -1 if arg mandatory */
200    int   op;    int   op_arg;              /* Op when arg present, or -1 if not allowed */
201  } verbitem;  } verbitem;
202    
203  static verbitem verbs[] = {  static const char verbnames[] =
204    { "ACCEPT", 6, OP_ACCEPT },    "\0"                       /* Empty name is a shorthand for MARK */
205    { "COMMIT", 6, OP_COMMIT },    STRING_MARK0
206    { "F",      1, OP_FAIL },    STRING_ACCEPT0
207    { "FAIL",   4, OP_FAIL },    STRING_COMMIT0
208    { "PRUNE",  5, OP_PRUNE },    STRING_F0
209    { "SKIP",   4, OP_SKIP  },    STRING_FAIL0
210    { "THEN",   4, OP_THEN  }    STRING_PRUNE0
211      STRING_SKIP0
212      STRING_THEN;
213    
214    static const verbitem verbs[] = {
215      { 0, -1,        OP_MARK },
216      { 4, -1,        OP_MARK },
217      { 6, OP_ACCEPT, -1 },
218      { 6, OP_COMMIT, -1 },
219      { 1, OP_FAIL,   -1 },
220      { 4, OP_FAIL,   -1 },
221      { 5, OP_PRUNE,  OP_PRUNE_ARG },
222      { 4, OP_SKIP,   OP_SKIP_ARG  },
223      { 4, OP_THEN,   OP_THEN_ARG  }
224  };  };
225    
226  static int verbcount = sizeof(verbs)/sizeof(verbitem);  static const int verbcount = sizeof(verbs)/sizeof(verbitem);
227    
228    
229  /* Tables of names of POSIX character classes and their lengths. The list is  /* Tables of names of POSIX character classes and their lengths. The names are
230  terminated by a zero length entry. The first three must be alpha, lower, upper,  now all in a single string, to reduce the number of relocations when a shared
231  as this is assumed for handling case independence. */  library is dynamically loaded. The list of lengths is terminated by a zero
232    length entry. The first three must be alpha, lower, upper, as this is assumed
233  static const char *const posix_names[] = {  for handling case independence. */
234    "alpha", "lower", "upper",  
235    "alnum", "ascii", "blank", "cntrl", "digit", "graph",  static const char posix_names[] =
236    "print", "punct", "space", "word",  "xdigit" };    STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
237      STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
238      STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
239      STRING_word0  STRING_xdigit;
240    
241  static const uschar posix_name_lengths[] = {  static const uschar posix_name_lengths[] = {
242    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
# Line 200  static const int posix_class_maps[] = { Line 268  static const int posix_class_maps[] = {
268    cbit_xdigit,-1,          0              /* xdigit */    cbit_xdigit,-1,          0              /* xdigit */
269  };  };
270    
271    /* Table of substitutes for \d etc when PCRE_UCP is set. The POSIX class
272    substitutes must be in the order of the names, defined above, and there are
273    both positive and negative cases. NULL means no substitute. */
274    
275    #ifdef SUPPORT_UCP
276    static const uschar *substitutes[] = {
277      (uschar *)"\\P{Nd}",    /* \D */
278      (uschar *)"\\p{Nd}",    /* \d */
279      (uschar *)"\\P{Xsp}",   /* \S */       /* NOTE: Xsp is Perl space */
280      (uschar *)"\\p{Xsp}",   /* \s */
281      (uschar *)"\\P{Xwd}",   /* \W */
282      (uschar *)"\\p{Xwd}"    /* \w */
283    };
284    
285    static const uschar *posix_substitutes[] = {
286      (uschar *)"\\p{L}",     /* alpha */
287      (uschar *)"\\p{Ll}",    /* lower */
288      (uschar *)"\\p{Lu}",    /* upper */
289      (uschar *)"\\p{Xan}",   /* alnum */
290      NULL,                   /* ascii */
291      (uschar *)"\\h",        /* blank */
292      NULL,                   /* cntrl */
293      (uschar *)"\\p{Nd}",    /* digit */
294      NULL,                   /* graph */
295      NULL,                   /* print */
296      NULL,                   /* punct */
297      (uschar *)"\\p{Xps}",   /* space */    /* NOTE: Xps is POSIX space */
298      (uschar *)"\\p{Xwd}",   /* word */
299      NULL,                   /* xdigit */
300      /* Negated cases */
301      (uschar *)"\\P{L}",     /* ^alpha */
302      (uschar *)"\\P{Ll}",    /* ^lower */
303      (uschar *)"\\P{Lu}",    /* ^upper */
304      (uschar *)"\\P{Xan}",   /* ^alnum */
305      NULL,                   /* ^ascii */
306      (uschar *)"\\H",        /* ^blank */
307      NULL,                   /* ^cntrl */
308      (uschar *)"\\P{Nd}",    /* ^digit */
309      NULL,                   /* ^graph */
310      NULL,                   /* ^print */
311      NULL,                   /* ^punct */
312      (uschar *)"\\P{Xps}",   /* ^space */   /* NOTE: Xps is POSIX space */
313      (uschar *)"\\P{Xwd}",   /* ^word */
314      NULL                    /* ^xdigit */
315    };
316    #define POSIX_SUBSIZE (sizeof(posix_substitutes)/sizeof(uschar *))
317    #endif
318    
319  #define STRING(a)  # a  #define STRING(a)  # a
320  #define XSTRING(s) STRING(s)  #define XSTRING(s) STRING(s)
# Line 207  static const int posix_class_maps[] = { Line 322  static const int posix_class_maps[] = {
322  /* The texts of compile-time error messages. These are "char *" because they  /* The texts of compile-time error messages. These are "char *" because they
323  are passed to the outside world. Do not ever re-use any error number, because  are passed to the outside world. Do not ever re-use any error number, because
324  they are documented. Always add a new error instead. Messages marked DEAD below  they are documented. Always add a new error instead. Messages marked DEAD below
325  are no longer used. */  are no longer used. This used to be a table of strings, but in order to reduce
326    the number of relocations needed when a shared library is loaded dynamically,
327  static const char *error_texts[] = {  it is now one long string. We cannot use a table of offsets, because the
328    "no error",  lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
329    "\\ at end of pattern",  simply count through to the one we want - this isn't a performance issue
330    "\\c at end of pattern",  because these strings are used only when there is a compilation error.
331    "unrecognized character follows \\",  
332    "numbers out of order in {} quantifier",  Each substring ends with \0 to insert a null character. This includes the final
333    substring, so that the whole string ends with \0\0, which can be detected when
334    counting through. */
335    
336    static const char error_texts[] =
337      "no error\0"
338      "\\ at end of pattern\0"
339      "\\c at end of pattern\0"
340      "unrecognized character follows \\\0"
341      "numbers out of order in {} quantifier\0"
342    /* 5 */    /* 5 */
343    "number too big in {} quantifier",    "number too big in {} quantifier\0"
344    "missing terminating ] for character class",    "missing terminating ] for character class\0"
345    "invalid escape sequence in character class",    "invalid escape sequence in character class\0"
346    "range out of order in character class",    "range out of order in character class\0"
347    "nothing to repeat",    "nothing to repeat\0"
348    /* 10 */    /* 10 */
349    "operand of unlimited repeat could match the empty string",  /** DEAD **/    "operand of unlimited repeat could match the empty string\0"  /** DEAD **/
350    "internal error: unexpected repeat",    "internal error: unexpected repeat\0"
351    "unrecognized character after (?",    "unrecognized character after (? or (?-\0"
352    "POSIX named classes are supported only within a class",    "POSIX named classes are supported only within a class\0"
353    "missing )",    "missing )\0"
354    /* 15 */    /* 15 */
355    "reference to non-existent subpattern",    "reference to non-existent subpattern\0"
356    "erroffset passed as NULL",    "erroffset passed as NULL\0"
357    "unknown option bit(s) set",    "unknown option bit(s) set\0"
358    "missing ) after comment",    "missing ) after comment\0"
359    "parentheses nested too deeply",  /** DEAD **/    "parentheses nested too deeply\0"  /** DEAD **/
360    /* 20 */    /* 20 */
361    "regular expression is too large",    "regular expression is too large\0"
362    "failed to get memory",    "failed to get memory\0"
363    "unmatched parentheses",    "unmatched parentheses\0"
364    "internal error: code overflow",    "internal error: code overflow\0"
365    "unrecognized character after (?<",    "unrecognized character after (?<\0"
366    /* 25 */    /* 25 */
367    "lookbehind assertion is not fixed length",    "lookbehind assertion is not fixed length\0"
368    "malformed number or name after (?(",    "malformed number or name after (?(\0"
369    "conditional group contains more than two branches",    "conditional group contains more than two branches\0"
370    "assertion expected after (?(",    "assertion expected after (?(\0"
371    "(?R or (?[+-]digits must be followed by )",    "(?R or (?[+-]digits must be followed by )\0"
372    /* 30 */    /* 30 */
373    "unknown POSIX class name",    "unknown POSIX class name\0"
374    "POSIX collating elements are not supported",    "POSIX collating elements are not supported\0"
375    "this version of PCRE is not compiled with PCRE_UTF8 support",    "this version of PCRE is not compiled with PCRE_UTF8 support\0"
376    "spare error",  /** DEAD **/    "spare error\0"  /** DEAD **/
377    "character value in \\x{...} sequence is too large",    "character value in \\x{...} sequence is too large\0"
378    /* 35 */    /* 35 */
379    "invalid condition (?(0)",    "invalid condition (?(0)\0"
380    "\\C not allowed in lookbehind assertion",    "\\C not allowed in lookbehind assertion\0"
381    "PCRE does not support \\L, \\l, \\N, \\U, or \\u",    "PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0"
382    "number after (?C is > 255",    "number after (?C is > 255\0"
383    "closing ) for (?C expected",    "closing ) for (?C expected\0"
384    /* 40 */    /* 40 */
385    "recursive call could loop indefinitely",    "recursive call could loop indefinitely\0"
386    "unrecognized character after (?P",    "unrecognized character after (?P\0"
387    "syntax error in subpattern name (missing terminator)",    "syntax error in subpattern name (missing terminator)\0"
388    "two named subpatterns have the same name",    "two named subpatterns have the same name\0"
389    "invalid UTF-8 string",    "invalid UTF-8 string\0"
390    /* 45 */    /* 45 */
391    "support for \\P, \\p, and \\X has not been compiled",    "support for \\P, \\p, and \\X has not been compiled\0"
392    "malformed \\P or \\p sequence",    "malformed \\P or \\p sequence\0"
393    "unknown property name after \\P or \\p",    "unknown property name after \\P or \\p\0"
394    "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)",    "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
395    "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")",    "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
396    /* 50 */    /* 50 */
397    "repeated subpattern is too long",    /** DEAD **/    "repeated subpattern is too long\0"    /** DEAD **/
398    "octal value is greater than \\377 (not in UTF-8 mode)",    "octal value is greater than \\377 (not in UTF-8 mode)\0"
399    "internal error: overran compiling workspace",    "internal error: overran compiling workspace\0"
400    "internal error: previously-checked referenced subpattern not found",    "internal error: previously-checked referenced subpattern not found\0"
401    "DEFINE group contains more than one branch",    "DEFINE group contains more than one branch\0"
402    /* 55 */    /* 55 */
403    "repeating a DEFINE group is not allowed",    "repeating a DEFINE group is not allowed\0"  /** DEAD **/
404    "inconsistent NEWLINE options",    "inconsistent NEWLINE options\0"
405    "\\g is not followed by a braced name or an optionally braced non-zero number",    "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
406    "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number",    "a numbered reference must not be zero\0"
407    "(*VERB) with an argument is not supported",    "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0"
408    /* 60 */    /* 60 */
409    "(*VERB) not recognized"    "(*VERB) not recognized\0"
410  };    "number is too big\0"
411      "subpattern name expected\0"
412      "digit expected after (?+\0"
413      "] is an invalid data character in JavaScript compatibility mode\0"
414      /* 65 */
415      "different names for subpatterns of the same number are not allowed\0"
416      "(*MARK) must have an argument\0"
417      "this version of PCRE is not compiled with PCRE_UCP support\0"
418      "\\c must be followed by an ASCII character\0"
419      "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
420      /* 70 */
421      "internal error: unknown opcode in find_fixedlength()\0"
422      "\\N is not supported in a class\0"
423      "too many forward references\0"
424      ;
425    
426  /* Table to identify digits and hex digits. This is used when compiling  /* Table to identify digits and hex digits. This is used when compiling
427  patterns. Note that the tables in chartables are dependent on the locale, and  patterns. Note that the tables in chartables are dependent on the locale, and
# Line 302  For convenience, we use the same bit def Line 439  For convenience, we use the same bit def
439    
440  Then we can use ctype_digit and ctype_xdigit in the code. */  Then we can use ctype_digit and ctype_xdigit in the code. */
441    
442  #ifndef EBCDIC  /* This is the "normal" case, for ASCII systems */  #ifndef EBCDIC
443    
444    /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
445    UTF-8 mode. */
446    
447  static const unsigned char digitab[] =  static const unsigned char digitab[] =
448    {    {
449    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
# Line 338  static const unsigned char digitab[] = Line 479  static const unsigned char digitab[] =
479    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
480    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
481    
482  #else           /* This is the "abnormal" case, for EBCDIC systems */  #else
483    
484    /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
485    
486  static const unsigned char digitab[] =  static const unsigned char digitab[] =
487    {    {
488    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
# Line 413  static const unsigned char ebcdic_charta Line 557  static const unsigned char ebcdic_charta
557  /* Definition to allow mutual recursion */  /* Definition to allow mutual recursion */
558    
559  static BOOL  static BOOL
560    compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,    compile_regex(int, uschar **, const uschar **, int *, BOOL, BOOL, int, int,
561      int *, int *, branch_chain *, compile_data *, int *);      int *, int *, branch_chain *, compile_data *, int *);
562    
563    
564    
565  /*************************************************  /*************************************************
566    *            Find an error text                  *
567    *************************************************/
568    
569    /* The error texts are now all in one long string, to save on relocations. As
570    some of the text is of unknown length, we can't use a table of offsets.
571    Instead, just count through the NUL-terminated strings. This is not a performance
572    issue because it happens only when there has been a compilation error.
573    
574    Argument:   the error number (position in error_texts; values <= 0 give the first text)
575    Returns:    pointer to the error string, or to a fixed "not found" text if n is past the end
576    */
577    
578    static const char *
579    find_error_text(int n)
580    {
581    const char *s = error_texts;   /* start at the first message */
582    for (; n > 0; n--)             /* skip n messages */
583      {
584      while (*s++ != 0) {};        /* advance to just past this message's terminating NUL */
585      if (*s == 0) return "Error text not found (please report)";   /* empty string: ran off the end of the table */
586      }
587    return s;
588    }
589    
590    
591    /*************************************************
592    *           Expand the workspace                 *
593    *************************************************/
594    
595    /* This function is called during the second compiling phase, if the number of
596    forward references fills the existing workspace, which is originally a block on
597    the stack. A larger block is obtained from malloc() unless the ultimate limit
598    has been reached or the increase will be rather small. On success the old
599    contents are copied across and cd->start_workspace, cd->workspace_size and cd->hwm are updated.
600    Argument: pointer to the compile data block
601    Returns:  0 if all went well, else an error number (ERR72: cannot grow; ERR21: malloc failed)
602    */
603    
604    static int
605    expand_workspace(compile_data *cd)
606    {
607    uschar *newspace;
608    int newsize = cd->workspace_size * 2;   /* aim to double the workspace */
609    
610    if (newsize > COMPILE_WORK_SIZE_MAX) newsize = COMPILE_WORK_SIZE_MAX;   /* clamp to the upper limit */
611    if (cd->workspace_size >= COMPILE_WORK_SIZE_MAX ||   /* already at the limit, or ... */
612        newsize - cd->workspace_size < WORK_SIZE_SAFETY_MARGIN)   /* ... gain smaller than the safety margin */
613     return ERR72;
614    
615    newspace = (pcre_malloc)(newsize);
616    if (newspace == NULL) return ERR21;
617    
618    memcpy(newspace, cd->start_workspace, cd->workspace_size);   /* carry over the whole old workspace */
619    cd->hwm = (uschar *)newspace + (cd->hwm - cd->start_workspace);   /* keep hwm at the same offset in the new block */
620    if (cd->workspace_size > COMPILE_WORK_SIZE)   /* only a block bigger than the initial size is freed */
621      (pcre_free)((void *)cd->start_workspace);
622    cd->start_workspace = newspace;
623    cd->workspace_size = newsize;
624    return 0;
625    }
626    
627    
628    
629    /*************************************************
630    *            Check for counted repeat            *
631    *************************************************/
632    
633    /* This function is called when a '{' is encountered in a place where it might
634    start a quantifier. It looks ahead to see if it really is a quantifier or not.
635    It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
636    where the ddds are digits. Only the syntax is checked here, not the numeric values.
637    
638    Arguments:
639      p         pointer to the first char after '{'
640    
641    Returns:    TRUE or FALSE
642    */
643    
644    static BOOL
645    is_counted_repeat(const uschar *p)
646    {
647    if ((digitab[*p++] & ctype_digit) == 0) return FALSE;   /* must start with a digit */
648    while ((digitab[*p] & ctype_digit) != 0) p++;           /* skip the rest of the first number */
649    if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;        /* form {ddd} */
650    
651    if (*p++ != CHAR_COMMA) return FALSE;                   /* anything else must continue with a comma */
652    if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;        /* form {ddd,} */
653    
654    if ((digitab[*p++] & ctype_digit) == 0) return FALSE;   /* second number must start with a digit */
655    while ((digitab[*p] & ctype_digit) != 0) p++;           /* skip the rest of the second number */
656    
657    return (*p == CHAR_RIGHT_CURLY_BRACKET);                /* form {ddd,ddd} */
658    }
659    
660    
661    
662    /*************************************************
663  *            Handle escapes                      *  *            Handle escapes                      *
664  *************************************************/  *************************************************/
665    
# Line 439  Arguments: Line 680  Arguments:
680    
681  Returns:         zero or positive => a data character  Returns:         zero or positive => a data character
682                   negative => a special escape sequence                   negative => a special escape sequence
683                   on error, errorptr is set                   on error, errorcodeptr is set
684  */  */
685    
686  static int  static int
# Line 457  ptr--; /* Set Line 698  ptr--; /* Set
698    
699  if (c == 0) *errorcodeptr = ERR1;  if (c == 0) *errorcodeptr = ERR1;
700    
701  /* Non-alphamerics are literals. For digits or letters, do an initial lookup in  /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
702  a table. A non-zero result is something that can be returned immediately.  in a table. A non-zero result is something that can be returned immediately.
703  Otherwise further processing may be required. */  Otherwise further processing may be required. */
704    
705  #ifndef EBCDIC  /* ASCII coding */  #ifndef EBCDIC  /* ASCII/UTF-8 coding */
706  else if (c < '0' || c > 'z') {}                           /* Not alphameric */  else if (c < CHAR_0 || c > CHAR_z) {}                     /* Not alphanumeric */
707  else if ((i = escapes[c - '0']) != 0) c = i;  else if ((i = escapes[c - CHAR_0]) != 0) c = i;
708    
709  #else           /* EBCDIC coding */  #else           /* EBCDIC coding */
710  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphameric */  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphanumeric */
711  else if ((i = escapes[c - 0x48]) != 0)  c = i;  else if ((i = escapes[c - 0x48]) != 0)  c = i;
712  #endif  #endif
713    
# Line 482  else Line 723  else
723      /* A number of Perl escapes are not handled by PCRE. We give an explicit      /* A number of Perl escapes are not handled by PCRE. We give an explicit
724      error. */      error. */
725    
726      case 'l':      case CHAR_l:
727      case 'L':      case CHAR_L:
     case 'N':  
     case 'u':  
     case 'U':  
728      *errorcodeptr = ERR37;      *errorcodeptr = ERR37;
729      break;      break;
730    
731      /* \g must be followed by a number, either plain or braced. If positive, it      case CHAR_u:
732      is an absolute backreference. If negative, it is a relative backreference.      if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
733      This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a        {
734      reference to a named group. This is part of Perl's movement towards a        /* In JavaScript, \u must be followed by four hexadecimal numbers.
735      unified syntax for back references. As this is synonymous with \k{name}, we        Otherwise it is a lowercase u letter. */
736      fudge it up by pretending it really was \k. */        if ((digitab[ptr[1]] & ctype_xdigit) != 0 && (digitab[ptr[2]] & ctype_xdigit) != 0
737               && (digitab[ptr[3]] & ctype_xdigit) != 0 && (digitab[ptr[4]] & ctype_xdigit) != 0)
738            {
739            c = 0;
740            for (i = 0; i < 4; ++i)
741              {
742              register int cc = *(++ptr);
743    #ifndef EBCDIC  /* ASCII/UTF-8 coding */
744              if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
745              c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
746    #else           /* EBCDIC coding */
747              if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
748              c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
749    #endif
750              }
751            }
752          }
753        else
754          *errorcodeptr = ERR37;
755        break;
756    
757        case CHAR_U:
758        /* In JavaScript, \U is an uppercase U letter. */
759        if ((options & PCRE_JAVASCRIPT_COMPAT) == 0) *errorcodeptr = ERR37;
760        break;
761    
762        /* In a character class, \g is just a literal "g". Outside a character
763        class, \g must be followed by one of a number of specific things:
764    
765        (1) A number, either plain or braced. If positive, it is an absolute
766        backreference. If negative, it is a relative backreference. This is a Perl
767        5.10 feature.
768    
769        (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
770        is part of Perl's movement towards a unified syntax for back references. As
771        this is synonymous with \k{name}, we fudge it up by pretending it really
772        was \k.
773    
774        (3) For Oniguruma compatibility we also support \g followed by a name or a
775        number either in angle brackets or in single quotes. However, these are
776        (possibly recursive) subroutine calls, _not_ backreferences. Just return
777        the -ESC_g code (cf \k). */
778    
779        case CHAR_g:
780        if (isclass) break;
781        if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
782          {
783          c = -ESC_g;
784          break;
785          }
786    
787      case 'g':      /* Handle the Perl-compatible cases */
788      if (ptr[1] == '{')  
789        if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
790        {        {
791        const uschar *p;        const uschar *p;
792        for (p = ptr+2; *p != 0 && *p != '}'; p++)        for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
793          if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;          if (*p != CHAR_MINUS && (digitab[*p] & ctype_digit) == 0) break;
794        if (*p != 0 && *p != '}')        if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)
795          {          {
796          c = -ESC_k;          c = -ESC_k;
797          break;          break;
# Line 513  else Line 801  else
801        }        }
802      else braced = FALSE;      else braced = FALSE;
803    
804      if (ptr[1] == '-')      if (ptr[1] == CHAR_MINUS)
805        {        {
806        negated = TRUE;        negated = TRUE;
807        ptr++;        ptr++;
# Line 522  else Line 810  else
810    
811      c = 0;      c = 0;
812      while ((digitab[ptr[1]] & ctype_digit) != 0)      while ((digitab[ptr[1]] & ctype_digit) != 0)
813        c = c * 10 + *(++ptr) - '0';        c = c * 10 + *(++ptr) - CHAR_0;
814    
815        if (c < 0)   /* Integer overflow */
816          {
817          *errorcodeptr = ERR61;
818          break;
819          }
820    
821      if (c == 0 || (braced && *(++ptr) != '}'))      if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
822        {        {
823        *errorcodeptr = ERR57;        *errorcodeptr = ERR57;
824        return 0;        break;
825          }
826    
827        if (c == 0)
828          {
829          *errorcodeptr = ERR58;
830          break;
831        }        }
832    
833      if (negated)      if (negated)
# Line 535  else Line 835  else
835        if (c > bracount)        if (c > bracount)
836          {          {
837          *errorcodeptr = ERR15;          *errorcodeptr = ERR15;
838          return 0;          break;
839          }          }
840        c = bracount - (c - 1);        c = bracount - (c - 1);
841        }        }
# Line 555  else Line 855  else
855      value is greater than 377, the least significant 8 bits are taken. Inside a      value is greater than 377, the least significant 8 bits are taken. Inside a
856      character class, \ followed by a digit is always an octal number. */      character class, \ followed by a digit is always an octal number. */
857    
858      case '1': case '2': case '3': case '4': case '5':      case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
859      case '6': case '7': case '8': case '9':      case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
860    
861      if (!isclass)      if (!isclass)
862        {        {
863        oldptr = ptr;        oldptr = ptr;
864        c -= '0';        c -= CHAR_0;
865        while ((digitab[ptr[1]] & ctype_digit) != 0)        while ((digitab[ptr[1]] & ctype_digit) != 0)
866          c = c * 10 + *(++ptr) - '0';          c = c * 10 + *(++ptr) - CHAR_0;
867          if (c < 0)    /* Integer overflow */
868            {
869            *errorcodeptr = ERR61;
870            break;
871            }
872        if (c < 10 || c <= bracount)        if (c < 10 || c <= bracount)
873          {          {
874          c = -(ESC_REF + c);          c = -(ESC_REF + c);
# Line 576  else Line 881  else
881      generates a binary zero byte and treats the digit as a following literal.      generates a binary zero byte and treats the digit as a following literal.
882      Thus we have to pull back the pointer by one. */      Thus we have to pull back the pointer by one. */
883    
884      if ((c = *ptr) >= '8')      if ((c = *ptr) >= CHAR_8)
885        {        {
886        ptr--;        ptr--;
887        c = 0;        c = 0;
# Line 589  else Line 894  else
894      to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more      to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
895      than 3 octal digits. */      than 3 octal digits. */
896    
897      case '0':      case CHAR_0:
898      c -= '0';      c -= CHAR_0;
899      while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')      while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
900          c = c * 8 + *(++ptr) - '0';          c = c * 8 + *(++ptr) - CHAR_0;
901      if (!utf8 && c > 255) *errorcodeptr = ERR51;      if (!utf8 && c > 255) *errorcodeptr = ERR51;
902      break;      break;
903    
# Line 600  else Line 905  else
905      than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is      than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
906      treated as a data character. */      treated as a data character. */
907    
908      case 'x':      case CHAR_x:
909      if (ptr[1] == '{')      if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
910          {
911          /* In JavaScript, \x must be followed by two hexadecimal digits.
912          Otherwise it is a lowercase x letter. */
913          if ((digitab[ptr[1]] & ctype_xdigit) != 0 && (digitab[ptr[2]] & ctype_xdigit) != 0)
914            {
915            c = 0;
916            for (i = 0; i < 2; ++i)
917              {
918              register int cc = *(++ptr);
919    #ifndef EBCDIC  /* ASCII/UTF-8 coding */
920              if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
921              c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
922    #else           /* EBCDIC coding */
923              if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
924              c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
925    #endif
926              }
927            }
928          break;
929          }
930    
931        if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
932        {        {
933        const uschar *pt = ptr + 2;        const uschar *pt = ptr + 2;
934        int count = 0;        int count = 0;
# Line 610  else Line 937  else
937        while ((digitab[*pt] & ctype_xdigit) != 0)        while ((digitab[*pt] & ctype_xdigit) != 0)
938          {          {
939          register int cc = *pt++;          register int cc = *pt++;
940          if (c == 0 && cc == '0') continue;     /* Leading zeroes */          if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
941          count++;          count++;
942    
943  #ifndef EBCDIC  /* ASCII coding */  #ifndef EBCDIC  /* ASCII/UTF-8 coding */
944          if (cc >= 'a') cc -= 32;               /* Convert to upper case */          if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
945          c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));          c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
946  #else           /* EBCDIC coding */  #else           /* EBCDIC coding */
947          if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */          if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
948          c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));          c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
949  #endif  #endif
950          }          }
951    
952        if (*pt == '}')        if (*pt == CHAR_RIGHT_CURLY_BRACKET)
953          {          {
954          if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;          if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
955          ptr = pt;          ptr = pt;
# Line 638  else Line 965  else
965      c = 0;      c = 0;
966      while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)      while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
967        {        {
968        int cc;                               /* Some compilers don't like ++ */        int cc;                                  /* Some compilers don't like */
969        cc = *(++ptr);                        /* in initializers */        cc = *(++ptr);                           /* ++ in initializers */
970  #ifndef EBCDIC  /* ASCII coding */  #ifndef EBCDIC  /* ASCII/UTF-8 coding */
971        if (cc >= 'a') cc -= 32;              /* Convert to upper case */        if (cc >= CHAR_a) cc -= 32;              /* Convert to upper case */
972        c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));        c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
973  #else           /* EBCDIC coding */  #else           /* EBCDIC coding */
974        if (cc <= 'z') cc += 64;              /* Convert to upper case */        if (cc <= CHAR_z) cc += 64;              /* Convert to upper case */
975        c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));        c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
976  #endif  #endif
977        }        }
978      break;      break;
979    
980      /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.      /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
981      This coding is ASCII-specific, but then the whole concept of \cx is      An error is given if the byte following \c is not an ASCII character. This
982        coding is ASCII-specific, but then the whole concept of \cx is
983      ASCII-specific. (However, an EBCDIC equivalent has now been added.) */      ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
984    
985      case 'c':      case CHAR_c:
986      c = *(++ptr);      c = *(++ptr);
987      if (c == 0)      if (c == 0)
988        {        {
989        *errorcodeptr = ERR2;        *errorcodeptr = ERR2;
990        return 0;        break;
991        }        }
992    #ifndef EBCDIC    /* ASCII/UTF-8 coding */
993  #ifndef EBCDIC  /* ASCII coding */      if (c > 127)  /* Excludes all non-ASCII in either mode */
994      if (c >= 'a' && c <= 'z') c -= 32;        {
995          *errorcodeptr = ERR68;
996          break;
997          }
998        if (c >= CHAR_a && c <= CHAR_z) c -= 32;
999      c ^= 0x40;      c ^= 0x40;
1000  #else           /* EBCDIC coding */  #else             /* EBCDIC coding */
1001      if (c >= 'a' && c <= 'z') c += 64;      if (c >= CHAR_a && c <= CHAR_z) c += 64;
1002      c ^= 0xC0;      c ^= 0xC0;
1003  #endif  #endif
1004      break;      break;
1005    
1006      /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any      /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
1007      other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,      other alphanumeric following \ is an error if PCRE_EXTRA was set;
1008      for Perl compatibility, it is a literal. This code looks a bit odd, but      otherwise, for Perl compatibility, it is a literal. This code looks a bit
1009      there used to be some cases other than the default, and there may be again      odd, but there used to be some cases other than the default, and there may
1010      in future, so I haven't "optimized" it. */      be again in future, so I haven't "optimized" it. */
1011    
1012      default:      default:
1013      if ((options & PCRE_EXTRA) != 0) switch(c)      if ((options & PCRE_EXTRA) != 0) switch(c)
# Line 688  else Line 1020  else
1020      }      }
1021    }    }
1022    
1023    /* Perl supports \N{name} for character names, as well as plain \N for "not
1024    newline". PCRE does not support \N{name}. However, it does support
1025    quantification such as \N{2,3}. */
1026    
1027    if (c == -ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET &&
1028         !is_counted_repeat(ptr+2))
1029      *errorcodeptr = ERR37;
1030    
1031    /* If PCRE_UCP is set, we change the values for \d etc. */
1032    
1033    if ((options & PCRE_UCP) != 0 && c <= -ESC_D && c >= -ESC_w)
1034      c -= (ESC_DU - ESC_D);
1035    
1036    /* Set the pointer to the final character before returning. */
1037    
1038  *ptrptr = ptr;  *ptrptr = ptr;
1039  return c;  return c;
1040  }  }
# Line 728  if (c == 0) goto ERROR_RETURN; Line 1075  if (c == 0) goto ERROR_RETURN;
1075  /* \P or \p can be followed by a name in {}, optionally preceded by ^ for  /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
1076  negation. */  negation. */
1077    
1078  if (c == '{')  if (c == CHAR_LEFT_CURLY_BRACKET)
1079    {    {
1080    if (ptr[1] == '^')    if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1081      {      {
1082      *negptr = TRUE;      *negptr = TRUE;
1083      ptr++;      ptr++;
# Line 739  if (c == '{') Line 1086  if (c == '{')
1086      {      {
1087      c = *(++ptr);      c = *(++ptr);
1088      if (c == 0) goto ERROR_RETURN;      if (c == 0) goto ERROR_RETURN;
1089      if (c == '}') break;      if (c == CHAR_RIGHT_CURLY_BRACKET) break;
1090      name[i] = c;      name[i] = c;
1091      }      }
1092    if (c !='}') goto ERROR_RETURN;    if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
1093    name[i] = 0;    name[i] = 0;
1094    }    }
1095    
# Line 764  top = _pcre_utt_size; Line 1111  top = _pcre_utt_size;
1111  while (bot < top)  while (bot < top)
1112    {    {
1113    i = (bot + top) >> 1;    i = (bot + top) >> 1;
1114    c = strcmp(name, _pcre_utt[i].name);    c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);
1115    if (c == 0)    if (c == 0)
1116      {      {
1117      *dptr = _pcre_utt[i].value;      *dptr = _pcre_utt[i].value;
# Line 788  return -1; Line 1135  return -1;
1135    
1136    
1137  /*************************************************  /*************************************************
 *            Check for counted repeat            *  
 *************************************************/  
   
 /* This function is called when a '{' is encountered in a place where it might  
 start a quantifier. It looks ahead to see if it really is a quantifier or not.  
 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}  
 where the ddds are digits.  
   
 Arguments:  
   p         pointer to the first char after '{'  
   
 Returns:    TRUE or FALSE  
 */  
   
 static BOOL  
 is_counted_repeat(const uschar *p)  
 {  
 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;  
 while ((digitab[*p] & ctype_digit) != 0) p++;  
 if (*p == '}') return TRUE;  
   
 if (*p++ != ',') return FALSE;  
 if (*p == '}') return TRUE;  
   
 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;  
 while ((digitab[*p] & ctype_digit) != 0) p++;  
   
 return (*p == '}');  
 }  
   
   
   
 /*************************************************  
1138  *         Read repeat counts                     *  *         Read repeat counts                     *
1139  *************************************************/  *************************************************/
1140    
# Line 848  int max = -1; Line 1162  int max = -1;
1162  /* Read the minimum value and do a paranoid check: a negative value indicates  /* Read the minimum value and do a paranoid check: a negative value indicates
1163  an integer overflow. */  an integer overflow. */
1164    
1165  while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';  while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - CHAR_0;
1166  if (min < 0 || min > 65535)  if (min < 0 || min > 65535)
1167    {    {
1168    *errorcodeptr = ERR5;    *errorcodeptr = ERR5;
# Line 858  if (min < 0 || min > 65535) Line 1172  if (min < 0 || min > 65535)
1172  /* Read the maximum value if there is one, and again do a paranoid check on its size.  /* Read the maximum value if there is one, and again do a paranoid check on its size.
1173  Also, max must not be less than min. */  Also, max must not be less than min. */
1174    
1175  if (*p == '}') max = min; else  if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
1176    {    {
1177    if (*(++p) != '}')    if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
1178      {      {
1179      max = 0;      max = 0;
1180      while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';      while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - CHAR_0;
1181      if (max < 0 || max > 65535)      if (max < 0 || max > 65535)
1182        {        {
1183        *errorcodeptr = ERR5;        *errorcodeptr = ERR5;
# Line 888  return p; Line 1202  return p;
1202    
1203    
1204  /*************************************************  /*************************************************
1205  *       Find forward referenced subpattern       *  *  Subroutine for finding forward reference      *
1206  *************************************************/  *************************************************/
1207    
1208  /* This function scans along a pattern's text looking for capturing  /* This recursive function is called only from find_parens() below. The
1209    top-level call starts at the beginning of the pattern. All other calls must
1210    start at a parenthesis. It scans along a pattern's text looking for capturing
1211  subpatterns, and counting them. If it finds a named pattern that matches the  subpatterns, and counting them. If it finds a named pattern that matches the
1212  name it is given, it returns its number. Alternatively, if the name is NULL, it  name it is given, it returns its number. Alternatively, if the name is NULL, it
1213  returns when it reaches a given numbered subpattern. This is used for forward  returns when it reaches a given numbered subpattern. Recursion is used to keep
1214  references to subpatterns. We know that if (?P< is encountered, the name will  track of subpatterns that reset the capturing group numbers - the (?| feature.
1215  be terminated by '>' because that is checked in the first pass.  
1216    This function was originally called only from the second pass, in which we know
1217    that if (?< or (?' or (?P< is encountered, the name will be correctly
1218    terminated because that is checked in the first pass. There is now one call to
1219    this function in the first pass, to check for a recursive back reference by
1220    name (so that we can make the whole group atomic). In this case, we need check
1221    only up to the current position in the pattern, and that is still OK because
1222    any previous occurrences will have been checked. To make this work, the test
1223    for "end of pattern" is a check against cd->end_pattern in the main loop,
1224    instead of looking for a binary zero. This means that the special first-pass
1225    call can adjust cd->end_pattern temporarily. (Checks for binary zero while
1226    processing items within the loop are OK, because afterwards the main loop will
1227    terminate.)
1228    
1229  Arguments:  Arguments:
1230    ptr          current position in the pattern    ptrptr       address of the current character pointer (updated)
1231    count        current count of capturing parens so far encountered    cd           compile background data
1232    name         name to seek, or NULL if seeking a numbered subpattern    name         name to seek, or NULL if seeking a numbered subpattern
1233    lorn         name length, or subpattern number if name is NULL    lorn         name length, or subpattern number if name is NULL
1234    xmode        TRUE if we are in /x mode    xmode        TRUE if we are in /x mode
1235      utf8         TRUE if we are in UTF-8 mode
1236      count        pointer to the current capturing subpattern number (updated)
1237    
1238  Returns:       the number of the named subpattern, or -1 if not found  Returns:       the number of the named subpattern, or -1 if not found
1239  */  */
1240    
1241  static int  static int
1242  find_parens(const uschar *ptr, int count, const uschar *name, int lorn,  find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn,
1243    BOOL xmode)    BOOL xmode, BOOL utf8, int *count)
1244  {  {
1245  const uschar *thisname;  uschar *ptr = *ptrptr;
1246    int start_count = *count;
1247    int hwm_count = start_count;
1248    BOOL dup_parens = FALSE;
1249    
1250  for (; *ptr != 0; ptr++)  /* If the first character is a parenthesis, check on the type of group we are
1251    dealing with. The very first call may not start with a parenthesis. */
1252    
1253    if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1254    {    {
1255    int term;    /* Handle specials such as (*SKIP) or (*UTF8) etc. */
1256    
1257      if (ptr[1] == CHAR_ASTERISK) ptr += 2;
1258    
1259      /* Handle a normal, unnamed capturing parenthesis. */
1260    
1261      else if (ptr[1] != CHAR_QUESTION_MARK)
1262        {
1263        *count += 1;
1264        if (name == NULL && *count == lorn) return *count;
1265        ptr++;
1266        }
1267    
1268      /* All cases now have (? at the start. Remember when we are in a group
1269      where the parenthesis numbers are duplicated. */
1270    
1271      else if (ptr[2] == CHAR_VERTICAL_LINE)
1272        {
1273        ptr += 3;
1274        dup_parens = TRUE;
1275        }
1276    
1277      /* Handle comments; all characters are allowed until a ket is reached. */
1278    
1279      else if (ptr[2] == CHAR_NUMBER_SIGN)
1280        {
1281        for (ptr += 3; *ptr != 0; ptr++) if (*ptr == CHAR_RIGHT_PARENTHESIS) break;
1282        goto FAIL_EXIT;
1283        }
1284    
1285      /* Handle a condition. If it is an assertion, just carry on so that it
1286      is processed as normal. If not, skip to the closing parenthesis of the
1287      condition (there can't be any nested parens). */
1288    
1289      else if (ptr[2] == CHAR_LEFT_PARENTHESIS)
1290        {
1291        ptr += 2;
1292        if (ptr[1] != CHAR_QUESTION_MARK)
1293          {
1294          while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
1295          if (*ptr != 0) ptr++;
1296          }
1297        }
1298    
1299      /* Start with (? but not a condition. */
1300    
1301      else
1302        {
1303        ptr += 2;
1304        if (*ptr == CHAR_P) ptr++;                      /* Allow optional P */
1305    
1306        /* We have to disambiguate (?<! and (?<= from (?<name> for named groups */
1307    
1308        if ((*ptr == CHAR_LESS_THAN_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK &&
1309            ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)
1310          {
1311          int term;
1312          const uschar *thisname;
1313          *count += 1;
1314          if (name == NULL && *count == lorn) return *count;
1315          term = *ptr++;
1316          if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;
1317          thisname = ptr;
1318          while (*ptr != term) ptr++;
1319          if (name != NULL && lorn == ptr - thisname &&
1320              strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1321            return *count;
1322          term++;
1323          }
1324        }
1325      }
1326    
1327    /* Past any initial parenthesis handling, scan for parentheses or vertical
1328    bars. Stop if we get to cd->end_pattern. Note that this is important for the
1329    first-pass call when this value is temporarily adjusted to stop at the current
1330    position. So DO NOT change this to a test for binary zero. */
1331    
1332    for (; ptr < cd->end_pattern; ptr++)
1333      {
1334    /* Skip over backslashed characters and also entire \Q...\E */    /* Skip over backslashed characters and also entire \Q...\E */
1335    
1336    if (*ptr == '\\')    if (*ptr == CHAR_BACKSLASH)
1337      {      {
1338      if (*(++ptr) == 0) return -1;      if (*(++ptr) == 0) goto FAIL_EXIT;
1339      if (*ptr == 'Q') for (;;)      if (*ptr == CHAR_Q) for (;;)
1340        {        {
1341        while (*(++ptr) != 0 && *ptr != '\\');        while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1342        if (*ptr == 0) return -1;        if (*ptr == 0) goto FAIL_EXIT;
1343        if (*(++ptr) == 'E') break;        if (*(++ptr) == CHAR_E) break;
1344        }        }
1345      continue;      continue;
1346      }      }
1347    
1348    /* Skip over character classes */    /* Skip over character classes; this logic must be similar to the way they
1349      are handled for real. If the first character is '^', skip it. Also, if the
1350      first few characters (either before or after ^) are \Q\E or \E we skip them
1351      too. This makes for compatibility with Perl. Note the use of STR macros to
1352      encode "Q\\E" so that it works in UTF-8 on EBCDIC platforms. */
1353    
1354    if (*ptr == '[')    if (*ptr == CHAR_LEFT_SQUARE_BRACKET)
1355      {      {
1356      while (*(++ptr) != ']')      BOOL negate_class = FALSE;
1357        for (;;)
1358          {
1359          if (ptr[1] == CHAR_BACKSLASH)
1360            {
1361            if (ptr[2] == CHAR_E)
1362              ptr+= 2;
1363            else if (strncmp((const char *)ptr+2,
1364                     STR_Q STR_BACKSLASH STR_E, 3) == 0)
1365              ptr += 4;
1366            else
1367              break;
1368            }
1369          else if (!negate_class && ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1370            {
1371            negate_class = TRUE;
1372            ptr++;
1373            }
1374          else break;
1375          }
1376    
1377        /* If the next character is ']', it is a data character that must be
1378        skipped, except in JavaScript compatibility mode. */
1379    
1380        if (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET &&
1381            (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
1382          ptr++;
1383    
1384        while (*(++ptr) != CHAR_RIGHT_SQUARE_BRACKET)
1385        {        {
1386        if (*ptr == '\\')        if (*ptr == 0) return -1;
1387          if (*ptr == CHAR_BACKSLASH)
1388          {          {
1389          if (*(++ptr) == 0) return -1;          if (*(++ptr) == 0) goto FAIL_EXIT;
1390          if (*ptr == 'Q') for (;;)          if (*ptr == CHAR_Q) for (;;)
1391            {            {
1392            while (*(++ptr) != 0 && *ptr != '\\');            while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1393            if (*ptr == 0) return -1;            if (*ptr == 0) goto FAIL_EXIT;
1394            if (*(++ptr) == 'E') break;            if (*(++ptr) == CHAR_E) break;
1395            }            }
1396          continue;          continue;
1397          }          }
# Line 955  for (; *ptr != 0; ptr++) Line 1401  for (; *ptr != 0; ptr++)
1401    
1402    /* Skip comments in /x mode */    /* Skip comments in /x mode */
1403    
1404    if (xmode && *ptr == '#')    if (xmode && *ptr == CHAR_NUMBER_SIGN)
1405      {      {
1406      while (*(++ptr) != 0 && *ptr != '\n');      ptr++;
1407      if (*ptr == 0) return -1;      while (*ptr != 0)
1408          {
1409          if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
1410          ptr++;
1411    #ifdef SUPPORT_UTF8
1412          if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
1413    #endif
1414          }
1415        if (*ptr == 0) goto FAIL_EXIT;
1416      continue;      continue;
1417      }      }
1418    
1419    /* An opening parens must now be a real metacharacter */    /* Check for the special metacharacters */
1420    
1421    if (*ptr != '(') continue;    if (*ptr == CHAR_LEFT_PARENTHESIS)
   if (ptr[1] != '?' && ptr[1] != '*')  
1422      {      {
1423      count++;      int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, count);
1424      if (name == NULL && count == lorn) return count;      if (rc > 0) return rc;
1425      continue;      if (*ptr == 0) goto FAIL_EXIT;
1426      }      }
1427    
1428    ptr += 2;    else if (*ptr == CHAR_RIGHT_PARENTHESIS)
1429    if (*ptr == 'P') ptr++;                      /* Allow optional P */      {
1430        if (dup_parens && *count < hwm_count) *count = hwm_count;
1431        goto FAIL_EXIT;
1432        }
1433    
1434    /* We have to disambiguate (?<! and (?<= from (?<name> */    else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)
1435        {
1436        if (*count > hwm_count) hwm_count = *count;
1437        *count = start_count;
1438        }
1439      }
1440    
1441    if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&  FAIL_EXIT:
1442         *ptr != '\'')  *ptrptr = ptr;
1443      continue;  return -1;
1444    }
1445    
1446    
1447    
1448    
1449    /*************************************************
1450    *       Find forward referenced subpattern       *
1451    *************************************************/
1452    
1453    /* This function scans along a pattern's text looking for capturing
1454    subpatterns, and counting them. If it finds a named pattern that matches the
1455    name it is given, it returns its number. Alternatively, if the name is NULL, it
1456    returns when it reaches a given numbered subpattern. This is used for forward
1457    references to subpatterns. We used to be able to start this scan from the
1458    current compiling point, using the current count value from cd->bracount, and
1459    do it all in a single loop, but the addition of the possibility of duplicate
1460    subpattern numbers means that we have to scan from the very start, in order to
1461    take account of such duplicates, and to use a recursive function to keep track
1462    of the different types of group.
1463    
1464    count++;  Arguments:
1465      cd           compile background data
1466      name         name to seek, or NULL if seeking a numbered subpattern
1467      lorn         name length, or subpattern number if name is NULL
1468      xmode        TRUE if we are in /x mode
1469      utf8         TRUE if we are in UTF-8 mode
1470    
1471    Returns:       the number of the found subpattern, or -1 if not found
1472    */
1473    
1474    if (name == NULL && count == lorn) return count;  static int
1475    term = *ptr++;  find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode,
1476    if (term == '<') term = '>';    BOOL utf8)
1477    thisname = ptr;  {
1478    while (*ptr != term) ptr++;  uschar *ptr = (uschar *)cd->start_pattern;
1479    if (name != NULL && lorn == ptr - thisname &&  int count = 0;
1480        strncmp((const char *)name, (const char *)thisname, lorn) == 0)  int rc;
1481      return count;  
1482    /* If the pattern does not start with an opening parenthesis, the first call
1483    to find_parens_sub() will scan right to the end (if necessary). However, if it
1484    does start with a parenthesis, find_parens_sub() will return when it hits the
1485    matching closing parens. That is why we have to have a loop. */
1486    
1487    for (;;)
1488      {
1489      rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, &count);
1490      if (rc > 0 || *ptr++ == 0) break;
1491    }    }
1492    
1493  return -1;  return rc;
1494  }  }
1495    
1496    
1497    
1498    
1499  /*************************************************  /*************************************************
1500  *      Find first significant op code            *  *      Find first significant op code            *
1501  *************************************************/  *************************************************/
1502    
1503  /* This is called by several functions that scan a compiled expression looking  /* This is called by several functions that scan a compiled expression looking
1504  for a fixed first character, or an anchoring op code etc. It skips over things  for a fixed first character, or an anchoring op code etc. It skips over things
1505  that do not influence this. For some calls, a change of option is important.  that do not influence this. For some calls, it makes sense to skip negative
1506  For some calls, it makes sense to skip negative forward and all backward  forward and all backward assertions, and also the \b assertion; for others it
1507  assertions, and also the \b assertion; for others it does not.  does not.
1508    
1509  Arguments:  Arguments:
1510    code         pointer to the start of the group    code         pointer to the start of the group
   options      pointer to external options  
   optbit       the option bit whose changing is significant, or  
                  zero if none are  
1511    skipassert   TRUE if certain assertions are to be skipped    skipassert   TRUE if certain assertions are to be skipped
1512    
1513  Returns:       pointer to the first significant opcode  Returns:       pointer to the first significant opcode
1514  */  */
1515    
1516  static const uschar*  static const uschar*
1517  first_significant_code(const uschar *code, int *options, int optbit,  first_significant_code(const uschar *code, BOOL skipassert)
   BOOL skipassert)  
1518  {  {
1519  for (;;)  for (;;)
1520    {    {
1521    switch ((int)*code)    switch ((int)*code)
1522      {      {
     case OP_OPT:  
     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))  
       *options = (int)code[1];  
     code += 2;  
     break;  
   
1523      case OP_ASSERT_NOT:      case OP_ASSERT_NOT:
1524      case OP_ASSERTBACK:      case OP_ASSERTBACK:
1525      case OP_ASSERTBACK_NOT:      case OP_ASSERTBACK_NOT:
# Line 1047  for (;;) Line 1535  for (;;)
1535    
1536      case OP_CALLOUT:      case OP_CALLOUT:
1537      case OP_CREF:      case OP_CREF:
1538        case OP_NCREF:
1539      case OP_RREF:      case OP_RREF:
1540        case OP_NRREF:
1541      case OP_DEF:      case OP_DEF:
1542      code += _pcre_OP_lengths[*code];      code += _pcre_OP_lengths[*code];
1543      break;      break;
# Line 1063  for (;;) Line 1553  for (;;)
1553    
1554    
1555  /*************************************************  /*************************************************
1556  *        Find the fixed length of a pattern      *  *        Find the fixed length of a branch       *
1557  *************************************************/  *************************************************/
1558    
1559  /* Scan a pattern and compute the fixed length of subject that will match it,  /* Scan a branch and compute the fixed length of subject that will match it,
1560  if the length is fixed. This is needed for dealing with backward assertions.  if the length is fixed. This is needed for dealing with backward assertions.
1561  In UTF8 mode, the result is in characters rather than bytes.  In UTF8 mode, the result is in characters rather than bytes. The branch is
1562    temporarily terminated with OP_END when this function is called.
1563    
1564    This function is called when a backward assertion is encountered, so that if it
1565    fails, the error message can point to the correct place in the pattern.
1566    However, we cannot do this when the assertion contains subroutine calls,
1567    because they can be forward references. We solve this by remembering this case
1568    and doing the check at the end; a flag specifies which mode we are running in.
1569    
1570  Arguments:  Arguments:
1571    code     points to the start of the pattern (the bracket)    code     points to the start of the pattern (the bracket)
1572    options  the compiling options    utf8     TRUE in UTF-8 mode
1573      atend    TRUE if called when the pattern is complete
1574  Returns:   the fixed length, or -1 if there is no fixed length,    cd       the "compile data" structure
1575               or -2 if \C was encountered  
1576    Returns:   the fixed length,
1577                 or -1 if there is no fixed length,
1578                 or -2 if \C was encountered (in UTF-8 mode only)
1579                 or -3 if an OP_RECURSE item was encountered and atend is FALSE
1580                 or -4 if an unknown opcode was encountered (internal error)
1581  */  */
1582    
1583  static int  static int
1584  find_fixedlength(uschar *code, int options)  find_fixedlength(uschar *code, BOOL utf8, BOOL atend, compile_data *cd)
1585  {  {
1586  int length = -1;  int length = -1;
1587    
# Line 1092  branch, check the length against that of Line 1594  branch, check the length against that of
1594  for (;;)  for (;;)
1595    {    {
1596    int d;    int d;
1597      uschar *ce, *cs;
1598    register int op = *cc;    register int op = *cc;
   
1599    switch (op)    switch (op)
1600      {      {
1601        /* We only need to continue for OP_CBRA (normal capturing bracket) and
1602        OP_BRA (normal non-capturing bracket) because the other variants of these
1603        opcodes are all concerned with unlimited repeated groups, which of course
1604        are not of fixed length. */
1605    
1606      case OP_CBRA:      case OP_CBRA:
1607      case OP_BRA:      case OP_BRA:
1608      case OP_ONCE:      case OP_ONCE:
1609        case OP_ONCE_NC:
1610      case OP_COND:      case OP_COND:
1611      d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);      d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), utf8, atend, cd);
1612      if (d < 0) return d;      if (d < 0) return d;
1613      branchlength += d;      branchlength += d;
1614      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
1615      cc += 1 + LINK_SIZE;      cc += 1 + LINK_SIZE;
1616      break;      break;
1617    
1618      /* Reached end of a branch; if it's a ket it is the end of a nested      /* Reached end of a branch; if it's a ket it is the end of a nested call.
1619      call. If it's ALT it is an alternation in a nested call. If it is      If it's ALT it is an alternation in a nested call. An ACCEPT is effectively
1620      END it's the end of the outer call. All can be handled by the same code. */      an ALT. If it is END it's the end of the outer call. All can be handled by
1621        the same code. Note that we must not include the OP_KETRxxx opcodes here,
1622        because they all imply an unlimited repeat. */
1623    
1624      case OP_ALT:      case OP_ALT:
1625      case OP_KET:      case OP_KET:
     case OP_KETRMAX:  
     case OP_KETRMIN:  
1626      case OP_END:      case OP_END:
1627        case OP_ACCEPT:
1628        case OP_ASSERT_ACCEPT:
1629      if (length < 0) length = branchlength;      if (length < 0) length = branchlength;
1630        else if (length != branchlength) return -1;        else if (length != branchlength) return -1;
1631      if (*cc != OP_ALT) return length;      if (*cc != OP_ALT) return length;
# Line 1123  for (;;) Line 1633  for (;;)
1633      branchlength = 0;      branchlength = 0;
1634      break;      break;
1635    
1636        /* A true recursion implies not fixed length, but a subroutine call may
1637        be OK. If the subroutine is a forward reference, we can't deal with
1638        it until the end of the pattern, so return -3. */
1639    
1640        case OP_RECURSE:
1641        if (!atend) return -3;
1642        cs = ce = (uschar *)cd->start_code + GET(cc, 1);  /* Start subpattern */
1643        do ce += GET(ce, 1); while (*ce == OP_ALT);       /* End subpattern */
1644        if (cc > cs && cc < ce) return -1;                /* Recursion */
1645        d = find_fixedlength(cs + 2, utf8, atend, cd);
1646        if (d < 0) return d;
1647        branchlength += d;
1648        cc += 1 + LINK_SIZE;
1649        break;
1650    
1651      /* Skip over assertive subpatterns */      /* Skip over assertive subpatterns */
1652    
1653      case OP_ASSERT:      case OP_ASSERT:
# Line 1134  for (;;) Line 1659  for (;;)
1659    
1660      /* Skip over things that don't match chars */      /* Skip over things that don't match chars */
1661    
1662      case OP_REVERSE:      case OP_MARK:
1663        case OP_PRUNE_ARG:
1664        case OP_SKIP_ARG:
1665        case OP_THEN_ARG:
1666        cc += cc[1] + _pcre_OP_lengths[*cc];
1667        break;
1668    
1669        case OP_CALLOUT:
1670        case OP_CIRC:
1671        case OP_CIRCM:
1672        case OP_CLOSE:
1673        case OP_COMMIT:
1674      case OP_CREF:      case OP_CREF:
     case OP_RREF:  
1675      case OP_DEF:      case OP_DEF:
1676      case OP_OPT:      case OP_DOLL:
1677      case OP_CALLOUT:      case OP_DOLLM:
     case OP_SOD:  
     case OP_SOM:  
1678      case OP_EOD:      case OP_EOD:
1679      case OP_EODN:      case OP_EODN:
1680      case OP_CIRC:      case OP_FAIL:
1681      case OP_DOLL:      case OP_NCREF:
1682        case OP_NRREF:
1683      case OP_NOT_WORD_BOUNDARY:      case OP_NOT_WORD_BOUNDARY:
1684        case OP_PRUNE:
1685        case OP_REVERSE:
1686        case OP_RREF:
1687        case OP_SET_SOM:
1688        case OP_SKIP:
1689        case OP_SOD:
1690        case OP_SOM:
1691        case OP_THEN:
1692      case OP_WORD_BOUNDARY:      case OP_WORD_BOUNDARY:
1693      cc += _pcre_OP_lengths[*cc];      cc += _pcre_OP_lengths[*cc];
1694      break;      break;
# Line 1154  for (;;) Line 1696  for (;;)
1696      /* Handle literal characters */      /* Handle literal characters */
1697    
1698      case OP_CHAR:      case OP_CHAR:
1699      case OP_CHARNC:      case OP_CHARI:
1700      case OP_NOT:      case OP_NOT:
1701        case OP_NOTI:
1702      branchlength++;      branchlength++;
1703      cc += 2;      cc += 2;
1704  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1705      if ((options & PCRE_UTF8) != 0)      if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
       {  
       while ((*cc & 0xc0) == 0x80) cc++;  
       }  
1706  #endif  #endif
1707      break;      break;
1708    
# Line 1170  for (;;) Line 1710  for (;;)
1710      need to skip over a multibyte character in UTF8 mode.  */      need to skip over a multibyte character in UTF8 mode.  */
1711    
1712      case OP_EXACT:      case OP_EXACT:
1713        case OP_EXACTI:
1714        case OP_NOTEXACT:
1715        case OP_NOTEXACTI:
1716      branchlength += GET2(cc,1);      branchlength += GET2(cc,1);
1717      cc += 4;      cc += 4;
1718  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1719      if ((options & PCRE_UTF8) != 0)      if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
       {  
       while((*cc & 0x80) == 0x80) cc++;  
       }  
1720  #endif  #endif
1721      break;      break;
1722    
1723      case OP_TYPEEXACT:      case OP_TYPEEXACT:
1724      branchlength += GET2(cc,1);      branchlength += GET2(cc,1);
1725        if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1726      cc += 4;      cc += 4;
1727      break;      break;
1728    
# Line 1192  for (;;) Line 1733  for (;;)
1733      cc += 2;      cc += 2;
1734      /* Fall through */      /* Fall through */
1735    
1736        case OP_HSPACE:
1737        case OP_VSPACE:
1738        case OP_NOT_HSPACE:
1739        case OP_NOT_VSPACE:
1740      case OP_NOT_DIGIT:      case OP_NOT_DIGIT:
1741      case OP_DIGIT:      case OP_DIGIT:
1742      case OP_NOT_WHITESPACE:      case OP_NOT_WHITESPACE:
# Line 1199  for (;;) Line 1744  for (;;)
1744      case OP_NOT_WORDCHAR:      case OP_NOT_WORDCHAR:
1745      case OP_WORDCHAR:      case OP_WORDCHAR:
1746      case OP_ANY:      case OP_ANY:
1747        case OP_ALLANY:
1748      branchlength++;      branchlength++;
1749      cc++;      cc++;
1750      break;      break;
1751    
1752      /* The single-byte matcher isn't allowed */      /* The single-byte matcher isn't allowed. This only happens in UTF-8 mode;
1753        otherwise \C is coded as OP_ALLANY. */
1754    
1755      case OP_ANYBYTE:      case OP_ANYBYTE:
1756      return -2;      return -2;
# Line 1222  for (;;) Line 1769  for (;;)
1769    
1770      switch (*cc)      switch (*cc)
1771        {        {
1772          case OP_CRPLUS:
1773          case OP_CRMINPLUS:
1774        case OP_CRSTAR:        case OP_CRSTAR:
1775        case OP_CRMINSTAR:        case OP_CRMINSTAR:
1776        case OP_CRQUERY:        case OP_CRQUERY:
# Line 1242  for (;;) Line 1791  for (;;)
1791    
1792      /* Anything else is variable length */      /* Anything else is variable length */
1793    
1794      default:      case OP_ANYNL:
1795        case OP_BRAMINZERO:
1796        case OP_BRAPOS:
1797        case OP_BRAPOSZERO:
1798        case OP_BRAZERO:
1799        case OP_CBRAPOS:
1800        case OP_EXTUNI:
1801        case OP_KETRMAX:
1802        case OP_KETRMIN:
1803        case OP_KETRPOS:
1804        case OP_MINPLUS:
1805        case OP_MINPLUSI:
1806        case OP_MINQUERY:
1807        case OP_MINQUERYI:
1808        case OP_MINSTAR:
1809        case OP_MINSTARI:
1810        case OP_MINUPTO:
1811        case OP_MINUPTOI:
1812        case OP_NOTMINPLUS:
1813        case OP_NOTMINPLUSI:
1814        case OP_NOTMINQUERY:
1815        case OP_NOTMINQUERYI:
1816        case OP_NOTMINSTAR:
1817        case OP_NOTMINSTARI:
1818        case OP_NOTMINUPTO:
1819        case OP_NOTMINUPTOI:
1820        case OP_NOTPLUS:
1821        case OP_NOTPLUSI:
1822        case OP_NOTPOSPLUS:
1823        case OP_NOTPOSPLUSI:
1824        case OP_NOTPOSQUERY:
1825        case OP_NOTPOSQUERYI:
1826        case OP_NOTPOSSTAR:
1827        case OP_NOTPOSSTARI:
1828        case OP_NOTPOSUPTO:
1829        case OP_NOTPOSUPTOI:
1830        case OP_NOTQUERY:
1831        case OP_NOTQUERYI:
1832        case OP_NOTSTAR:
1833        case OP_NOTSTARI:
1834        case OP_NOTUPTO:
1835        case OP_NOTUPTOI:
1836        case OP_PLUS:
1837        case OP_PLUSI:
1838        case OP_POSPLUS:
1839        case OP_POSPLUSI:
1840        case OP_POSQUERY:
1841        case OP_POSQUERYI:
1842        case OP_POSSTAR:
1843        case OP_POSSTARI:
1844        case OP_POSUPTO:
1845        case OP_POSUPTOI:
1846        case OP_QUERY:
1847        case OP_QUERYI:
1848        case OP_REF:
1849        case OP_REFI:
1850        case OP_SBRA:
1851        case OP_SBRAPOS:
1852        case OP_SCBRA:
1853        case OP_SCBRAPOS:
1854        case OP_SCOND:
1855        case OP_SKIPZERO:
1856        case OP_STAR:
1857        case OP_STARI:
1858        case OP_TYPEMINPLUS:
1859        case OP_TYPEMINQUERY:
1860        case OP_TYPEMINSTAR:
1861        case OP_TYPEMINUPTO:
1862        case OP_TYPEPLUS:
1863        case OP_TYPEPOSPLUS:
1864        case OP_TYPEPOSQUERY:
1865        case OP_TYPEPOSSTAR:
1866        case OP_TYPEPOSUPTO:
1867        case OP_TYPEQUERY:
1868        case OP_TYPESTAR:
1869        case OP_TYPEUPTO:
1870        case OP_UPTO:
1871        case OP_UPTOI:
1872      return -1;      return -1;
1873    
1874        /* Catch unrecognized opcodes so that when new ones are added they
1875        are not forgotten, as has happened in the past. */
1876    
1877        default:
1878        return -4;
1879      }      }
1880    }    }
1881  /* Control never gets here */  /* Control never gets here */
# Line 1253  for (;;) Line 1885  for (;;)
1885    
1886    
1887  /*************************************************  /*************************************************
1888  *    Scan compiled regex for numbered bracket    *  *    Scan compiled regex for specific bracket    *
1889  *************************************************/  *************************************************/
1890    
1891  /* This little function scans through a compiled pattern until it finds a  /* This little function scans through a compiled pattern until it finds a
1892  capturing bracket with the given number.  capturing bracket with the given number, or, if the number is negative, an
1893    instance of OP_REVERSE for a lookbehind. The function is global in the C sense
1894    so that it can be called from pcre_study() when finding the minimum matching
1895    length.
1896    
1897  Arguments:  Arguments:
1898    code        points to start of expression    code        points to start of expression
1899    utf8        TRUE in UTF-8 mode    utf8        TRUE in UTF-8 mode
1900    number      the required bracket number    number      the required bracket number or negative to find a lookbehind
1901    
1902  Returns:      pointer to the opcode for the bracket, or NULL if not found  Returns:      pointer to the opcode for the bracket, or NULL if not found
1903  */  */
1904    
1905  static const uschar *  const uschar *
1906  find_bracket(const uschar *code, BOOL utf8, int number)  _pcre_find_bracket(const uschar *code, BOOL utf8, int number)
1907  {  {
1908  for (;;)  for (;;)
1909    {    {
1910    register int c = *code;    register int c = *code;
1911    
1912    if (c == OP_END) return NULL;    if (c == OP_END) return NULL;
1913    
1914    /* XCLASS is used for classes that cannot be represented just by a bit    /* XCLASS is used for classes that cannot be represented just by a bit
# Line 1281  for (;;) Line 1917  for (;;)
1917    
1918    if (c == OP_XCLASS) code += GET(code, 1);    if (c == OP_XCLASS) code += GET(code, 1);
1919    
1920      /* Handle lookbehind: OP_REVERSE is the target when number is negative */
1921    
1922      else if (c == OP_REVERSE)
1923        {
1924        if (number < 0) return (uschar *)code;
1925        code += _pcre_OP_lengths[c];
1926        }
1927    
1928    /* Handle capturing bracket */    /* Handle capturing bracket */
1929    
1930    else if (c == OP_CBRA)    else if (c == OP_CBRA || c == OP_SCBRA ||
1931               c == OP_CBRAPOS || c == OP_SCBRAPOS)
1932      {      {
1933      int n = GET2(code, 1+LINK_SIZE);      int n = GET2(code, 1+LINK_SIZE);
1934      if (n == number) return (uschar *)code;      if (n == number) return (uschar *)code;
1935      code += _pcre_OP_lengths[c];      code += _pcre_OP_lengths[c];
1936      }      }
1937    
1938    /* In UTF-8 mode, opcodes that are followed by a character may be followed by    /* Otherwise, we can get the item's length from the table, except that for
1939    a multi-byte character. The length in the table is a minimum, so we have to    repeated character types, we have to test for \p and \P, which have an extra
1940    arrange to skip the extra bytes. */    two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
1941      must add in its length. */
1942    
1943    else    else
1944      {      {
1945        switch(c)
1946          {
1947          case OP_TYPESTAR:
1948          case OP_TYPEMINSTAR:
1949          case OP_TYPEPLUS:
1950          case OP_TYPEMINPLUS:
1951          case OP_TYPEQUERY:
1952          case OP_TYPEMINQUERY:
1953          case OP_TYPEPOSSTAR:
1954          case OP_TYPEPOSPLUS:
1955          case OP_TYPEPOSQUERY:
1956          if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1957          break;
1958    
1959          case OP_TYPEUPTO:
1960          case OP_TYPEMINUPTO:
1961          case OP_TYPEEXACT:
1962          case OP_TYPEPOSUPTO:
1963          if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1964          break;
1965    
1966          case OP_MARK:
1967          case OP_PRUNE_ARG:
1968          case OP_SKIP_ARG:
1969          code += code[1];
1970          break;
1971    
1972          case OP_THEN_ARG:
1973          code += code[1];
1974          break;
1975          }
1976    
1977        /* Add in the fixed length from the table */
1978    
1979      code += _pcre_OP_lengths[c];      code += _pcre_OP_lengths[c];
1980    
1981      /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1982      a multi-byte character. The length in the table is a minimum, so we have to
1983      arrange to skip the extra bytes. */
1984    
1985  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1986      if (utf8) switch(c)      if (utf8) switch(c)
1987        {        {
1988        case OP_CHAR:        case OP_CHAR:
1989        case OP_CHARNC:        case OP_CHARI:
1990        case OP_EXACT:        case OP_EXACT:
1991          case OP_EXACTI:
1992        case OP_UPTO:        case OP_UPTO:
1993          case OP_UPTOI:
1994        case OP_MINUPTO:        case OP_MINUPTO:
1995          case OP_MINUPTOI:
1996        case OP_POSUPTO:        case OP_POSUPTO:
1997          case OP_POSUPTOI:
1998        case OP_STAR:        case OP_STAR:
1999          case OP_STARI:
2000        case OP_MINSTAR:        case OP_MINSTAR:
2001          case OP_MINSTARI:
2002        case OP_POSSTAR:        case OP_POSSTAR:
2003          case OP_POSSTARI:
2004        case OP_PLUS:        case OP_PLUS:
2005          case OP_PLUSI:
2006        case OP_MINPLUS:        case OP_MINPLUS:
2007          case OP_MINPLUSI:
2008        case OP_POSPLUS:        case OP_POSPLUS:
2009          case OP_POSPLUSI:
2010        case OP_QUERY:        case OP_QUERY:
2011          case OP_QUERYI:
2012        case OP_MINQUERY:        case OP_MINQUERY:
2013          case OP_MINQUERYI:
2014        case OP_POSQUERY:        case OP_POSQUERY:
2015          case OP_POSQUERYI:
2016        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
2017        break;        break;
2018        }        }
2019    #else
2020        (void)(utf8);  /* Keep compiler happy by referencing function argument */
2021  #endif  #endif
2022      }      }
2023    }    }
# Line 1354  for (;;) Line 2054  for (;;)
2054    
2055    if (c == OP_XCLASS) code += GET(code, 1);    if (c == OP_XCLASS) code += GET(code, 1);
2056    
2057    /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes    /* Otherwise, we can get the item's length from the table, except that for
2058    that are followed by a character may be followed by a multi-byte character.    repeated character types, we have to test for \p and \P, which have an extra
2059    The length in the table is a minimum, so we have to arrange to skip the extra    two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
2060    bytes. */    must add in its length. */
2061    
2062    else    else
2063      {      {
2064        switch(c)
2065          {
2066          case OP_TYPESTAR:
2067          case OP_TYPEMINSTAR:
2068          case OP_TYPEPLUS:
2069          case OP_TYPEMINPLUS:
2070          case OP_TYPEQUERY:
2071          case OP_TYPEMINQUERY:
2072          case OP_TYPEPOSSTAR:
2073          case OP_TYPEPOSPLUS:
2074          case OP_TYPEPOSQUERY:
2075          if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2076          break;
2077    
2078          case OP_TYPEPOSUPTO:
2079          case OP_TYPEUPTO:
2080          case OP_TYPEMINUPTO:
2081          case OP_TYPEEXACT:
2082          if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
2083          break;
2084    
2085          case OP_MARK:
2086          case OP_PRUNE_ARG:
2087          case OP_SKIP_ARG:
2088          code += code[1];
2089          break;
2090    
2091          case OP_THEN_ARG:
2092          code += code[1];
2093          break;
2094          }
2095    
2096        /* Add in the fixed length from the table */
2097    
2098      code += _pcre_OP_lengths[c];      code += _pcre_OP_lengths[c];
2099    
2100        /* In UTF-8 mode, opcodes that are followed by a character may be followed
2101        by a multi-byte character. The length in the table is a minimum, so we have
2102        to arrange to skip the extra bytes. */
2103    
2104  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2105      if (utf8) switch(c)      if (utf8) switch(c)
2106        {        {
2107        case OP_CHAR:        case OP_CHAR:
2108        case OP_CHARNC:        case OP_CHARI:
2109        case OP_EXACT:        case OP_EXACT:
2110          case OP_EXACTI:
2111        case OP_UPTO:        case OP_UPTO:
2112          case OP_UPTOI:
2113        case OP_MINUPTO:        case OP_MINUPTO:
2114          case OP_MINUPTOI:
2115        case OP_POSUPTO:        case OP_POSUPTO:
2116          case OP_POSUPTOI:
2117        case OP_STAR:        case OP_STAR:
2118          case OP_STARI:
2119        case OP_MINSTAR:        case OP_MINSTAR:
2120          case OP_MINSTARI:
2121        case OP_POSSTAR:        case OP_POSSTAR:
2122          case OP_POSSTARI:
2123        case OP_PLUS:        case OP_PLUS:
2124          case OP_PLUSI:
2125        case OP_MINPLUS:        case OP_MINPLUS:
2126          case OP_MINPLUSI:
2127        case OP_POSPLUS:        case OP_POSPLUS:
2128          case OP_POSPLUSI:
2129        case OP_QUERY:        case OP_QUERY:
2130          case OP_QUERYI:
2131        case OP_MINQUERY:        case OP_MINQUERY:
2132          case OP_MINQUERYI:
2133        case OP_POSQUERY:        case OP_POSQUERY:
2134          case OP_POSQUERYI:
2135        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
2136        break;        break;
2137        }        }
2138    #else
2139        (void)(utf8);  /* Keep compiler happy by referencing function argument */
2140  #endif  #endif
2141      }      }
2142    }    }
# Line 1398  for (;;) Line 2152  for (;;)
2152  can match the empty string or not. It is called from could_be_empty()  can match the empty string or not. It is called from could_be_empty()
2153  below and from compile_branch() when checking for an unlimited repeat of a  below and from compile_branch() when checking for an unlimited repeat of a
2154  group that can match nothing. Note that first_significant_code() skips over  group that can match nothing. Note that first_significant_code() skips over
2155  assertions. If we hit an unclosed bracket, we return "empty" - this means we've  backward and negative forward assertions when its final argument is TRUE. If we
2156  struck an inner bracket whose current branch will already have been scanned.  hit an unclosed bracket, we return "empty" - this means we've struck an inner
2157    bracket whose current branch will already have been scanned.
2158    
2159  Arguments:  Arguments:
2160    code        points to start of search    code        points to start of search
2161    endcode     points to where to stop    endcode     points to where to stop
2162    utf8        TRUE if in UTF8 mode    utf8        TRUE if in UTF8 mode
2163      cd          contains pointers to tables etc.
2164    
2165  Returns:      TRUE if what is matched could be empty  Returns:      TRUE if what is matched could be empty
2166  */  */
2167    
2168  static BOOL  static BOOL
2169  could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)  could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8,
2170      compile_data *cd)
2171  {  {
2172  register int c;  register int c;
2173  for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);  for (code = first_significant_code(code + _pcre_OP_lengths[*code], TRUE);
2174       code < endcode;       code < endcode;
2175       code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))       code = first_significant_code(code + _pcre_OP_lengths[c], TRUE))
2176    {    {
2177    const uschar *ccode;    const uschar *ccode;
2178    
2179    c = *code;    c = *code;
2180    
2181      /* Skip over forward assertions; the other assertions are skipped by
2182      first_significant_code() with a TRUE final argument. */
2183    
2184      if (c == OP_ASSERT)
2185        {
2186        do code += GET(code, 1); while (*code == OP_ALT);
2187        c = *code;
2188        continue;
2189        }
2190    
2191      /* For a recursion/subroutine call, if its end has been reached, which
2192      implies a backward reference subroutine call, we can scan it. If it's a
2193      forward reference subroutine call, we can't. To detect forward reference
2194      we have to scan up the list that is kept in the workspace. This function is
2195      called only when doing the real compile, not during the pre-compile that
2196      measures the size of the compiled pattern. */
2197    
2198      if (c == OP_RECURSE)
2199        {
2200        const uschar *scode;
2201        BOOL empty_branch;
2202    
2203        /* Test for forward reference */
2204    
2205        for (scode = cd->start_workspace; scode < cd->hwm; scode += LINK_SIZE)
2206          if (GET(scode, 0) == code + 1 - cd->start_code) return TRUE;
2207    
2208        /* Not a forward reference, test for completed backward reference */
2209    
2210        empty_branch = FALSE;
2211        scode = cd->start_code + GET(code, 1);
2212        if (GET(scode, 1) == 0) return TRUE;    /* Unclosed */
2213    
2214        /* Completed backwards reference */
2215    
2216        do
2217          {
2218          if (could_be_empty_branch(scode, endcode, utf8, cd))
2219            {
2220            empty_branch = TRUE;
2221            break;
2222            }
2223          scode += GET(scode, 1);
2224          }
2225        while (*scode == OP_ALT);
2226    
2227        if (!empty_branch) return FALSE;  /* All branches are non-empty */
2228        continue;
2229        }
2230    
2231    /* Groups with zero repeats can of course be empty; skip them. */    /* Groups with zero repeats can of course be empty; skip them. */
2232    
2233    if (c == OP_BRAZERO || c == OP_BRAMINZERO)    if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO ||
2234          c == OP_BRAPOSZERO)
2235        {
2236        code += _pcre_OP_lengths[c];
2237        do code += GET(code, 1); while (*code == OP_ALT);
2238        c = *code;
2239        continue;
2240        }
2241    
2242      /* A nested group that is already marked as "could be empty" can just be
2243      skipped. */
2244    
2245      if (c == OP_SBRA  || c == OP_SBRAPOS ||
2246          c == OP_SCBRA || c == OP_SCBRAPOS)
2247      {      {
     code += _pcre_OP_lengths[c];  
2248      do code += GET(code, 1); while (*code == OP_ALT);      do code += GET(code, 1); while (*code == OP_ALT);
2249      c = *code;      c = *code;
2250      continue;      continue;
# Line 1433  for (code = first_significant_code(code Line 2252  for (code = first_significant_code(code
2252    
2253    /* For other groups, scan the branches. */    /* For other groups, scan the branches. */
2254    
2255    if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)    if (c == OP_BRA  || c == OP_BRAPOS ||
2256          c == OP_CBRA || c == OP_CBRAPOS ||
2257          c == OP_ONCE || c == OP_ONCE_NC ||
2258          c == OP_COND)
2259      {      {
2260      BOOL empty_branch;      BOOL empty_branch;
2261      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
2262    
2263      /* Scan a closed bracket */      /* If a conditional group has only one branch, there is a second, implied,
2264        empty branch, so just skip over the conditional, because it could be empty.
2265        Otherwise, scan the individual branches of the group. */
2266    
2267      empty_branch = FALSE;      if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
     do  
       {  
       if (!empty_branch && could_be_empty_branch(code, endcode, utf8))  
         empty_branch = TRUE;  
2268        code += GET(code, 1);        code += GET(code, 1);
2269        else
2270          {
2271          empty_branch = FALSE;
2272          do
2273            {
2274            if (!empty_branch && could_be_empty_branch(code, endcode, utf8, cd))
2275              empty_branch = TRUE;
2276            code += GET(code, 1);
2277            }
2278          while (*code == OP_ALT);
2279          if (!empty_branch) return FALSE;   /* All branches are non-empty */
2280        }        }
2281      while (*code == OP_ALT);  
     if (!empty_branch) return FALSE;   /* All branches are non-empty */  
2282      c = *code;      c = *code;
2283      continue;      continue;
2284      }      }
# Line 1457  for (code = first_significant_code(code Line 2287  for (code = first_significant_code(code
2287    
2288    switch (c)    switch (c)
2289      {      {
2290      /* Check for quantifiers after a class */      /* Check for quantifiers after a class. XCLASS is used for classes that
2291        cannot be represented just by a bit map. This includes negated single
2292        high-valued characters. The length in _pcre_OP_lengths[] is zero; the
2293        actual length is stored in the compiled code, so we must update "code"
2294        here. */
2295    
2296  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2297      case OP_XCLASS:      case OP_XCLASS:
2298      ccode = code + GET(code, 1);      ccode = code += GET(code, 1);
2299      goto CHECK_CLASS_REPEAT;      goto CHECK_CLASS_REPEAT;
2300  #endif  #endif
2301    
# Line 1505  for (code = first_significant_code(code Line 2339  for (code = first_significant_code(code
2339      case OP_NOT_WORDCHAR:      case OP_NOT_WORDCHAR:
2340      case OP_WORDCHAR:      case OP_WORDCHAR:
2341      case OP_ANY:      case OP_ANY:
2342        case OP_ALLANY:
2343      case OP_ANYBYTE:      case OP_ANYBYTE:
2344      case OP_CHAR:      case OP_CHAR:
2345      case OP_CHARNC:      case OP_CHARI:
2346      case OP_NOT:      case OP_NOT:
2347        case OP_NOTI:
2348      case OP_PLUS:      case OP_PLUS:
2349      case OP_MINPLUS:      case OP_MINPLUS:
2350      case OP_POSPLUS:      case OP_POSPLUS:
# Line 1523  for (code = first_significant_code(code Line 2359  for (code = first_significant_code(code
2359      case OP_TYPEEXACT:      case OP_TYPEEXACT:
2360      return FALSE;      return FALSE;
2361    
2362        /* These are going to continue, as they may be empty, but we have to
2363        fudge the length for the \p and \P cases. */
2364    
2365        case OP_TYPESTAR:
2366        case OP_TYPEMINSTAR:
2367        case OP_TYPEPOSSTAR:
2368        case OP_TYPEQUERY:
2369        case OP_TYPEMINQUERY:
2370        case OP_TYPEPOSQUERY:
2371        if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2372        break;
2373    
2374        /* Same for these */
2375    
2376        case OP_TYPEUPTO:
2377        case OP_TYPEMINUPTO:
2378        case OP_TYPEPOSUPTO:
2379        if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
2380        break;
2381    
2382      /* End of branch */      /* End of branch */
2383    
2384      case OP_KET:      case OP_KET:
2385      case OP_KETRMAX:      case OP_KETRMAX:
2386      case OP_KETRMIN:      case OP_KETRMIN:
2387        case OP_KETRPOS:
2388      case OP_ALT:      case OP_ALT:
2389      return TRUE;      return TRUE;
2390    
# Line 1536  for (code = first_significant_code(code Line 2393  for (code = first_significant_code(code
2393    
2394  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2395      case OP_STAR:      case OP_STAR:
2396        case OP_STARI:
2397      case OP_MINSTAR:      case OP_MINSTAR:
2398        case OP_MINSTARI:
2399      case OP_POSSTAR:      case OP_POSSTAR:
2400        case OP_POSSTARI:
2401      case OP_QUERY:      case OP_QUERY:
2402        case OP_QUERYI:
2403      case OP_MINQUERY:      case OP_MINQUERY:
2404        case OP_MINQUERYI:
2405      case OP_POSQUERY:      case OP_POSQUERY:
2406        case OP_POSQUERYI:
2407        if (utf8 && code[1] >= 0xc0) code += _pcre_utf8_table4[code[1] & 0x3f];
2408        break;
2409    
2410      case OP_UPTO:      case OP_UPTO:
2411        case OP_UPTOI:
2412      case OP_MINUPTO:      case OP_MINUPTO:
2413        case OP_MINUPTOI:
2414      case OP_POSUPTO:      case OP_POSUPTO:
2415      if (utf8) while ((code[2] & 0xc0) == 0x80) code++;      case OP_POSUPTOI:
2416        if (utf8 && code[3] >= 0xc0) code += _pcre_utf8_table4[code[3] & 0x3f];
2417      break;      break;
2418  #endif  #endif
2419    
2420        /* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument
2421        string. */
2422    
2423        case OP_MARK:
2424        case OP_PRUNE_ARG:
2425        case OP_SKIP_ARG:
2426        code += code[1];
2427        break;
2428    
2429        case OP_THEN_ARG:
2430        code += code[1];
2431        break;
2432    
2433        /* None of the remaining opcodes are required to match a character. */
2434    
2435        default:
2436        break;
2437      }      }
2438    }    }
2439    
# Line 1563  return TRUE; Line 2450  return TRUE;
2450  the current branch of the current pattern to see if it could match the empty  the current branch of the current pattern to see if it could match the empty
2451  string. If it could, we must look outwards for branches at other levels,  string. If it could, we must look outwards for branches at other levels,
2452  stopping when we pass beyond the bracket which is the subject of the recursion.  stopping when we pass beyond the bracket which is the subject of the recursion.
2453    This function is called only during the real compile, not during the
2454    pre-compile.
2455    
2456  Arguments:  Arguments:
2457    code        points to start of the recursion    code        points to start of the recursion
2458    endcode     points to where to stop (current RECURSE item)    endcode     points to where to stop (current RECURSE item)
2459    bcptr       points to the chain of current (unclosed) branch starts    bcptr       points to the chain of current (unclosed) branch starts
2460    utf8        TRUE if in UTF-8 mode    utf8        TRUE if in UTF-8 mode
2461      cd          pointers to tables etc
2462    
2463  Returns:      TRUE if what is matched could be empty  Returns:      TRUE if what is matched could be empty
2464  */  */
2465    
2466  static BOOL  static BOOL
2467  could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,  could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
2468    BOOL utf8)    BOOL utf8, compile_data *cd)
2469  {  {
2470  while (bcptr != NULL && bcptr->current >= code)  while (bcptr != NULL && bcptr->current_branch >= code)
2471    {    {
2472    if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;    if (!could_be_empty_branch(bcptr->current_branch, endcode, utf8, cd))
2473        return FALSE;
2474    bcptr = bcptr->outer;    bcptr = bcptr->outer;
2475    }    }
2476  return TRUE;  return TRUE;
# Line 1592  return TRUE; Line 2483  return TRUE;
2483  *************************************************/  *************************************************/
2484    
2485  /* This function is called when the sequence "[:" or "[." or "[=" is  /* This function is called when the sequence "[:" or "[." or "[=" is
2486  encountered in a character class. It checks whether this is followed by an  encountered in a character class. It checks whether this is followed by a
2487  optional ^ and then a sequence of letters, terminated by a matching ":]" or  sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
2488  ".]" or "=]".  reach an unescaped ']' without the special preceding character, return FALSE.
2489    
2490    Originally, this function only recognized a sequence of letters between the
2491    terminators, but it seems that Perl recognizes any sequence of characters,
2492    though of course unknown POSIX names are subsequently rejected. Perl gives an
2493    "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
2494    didn't consider this to be a POSIX class. Likewise for [:1234:].
2495    
2496    The problem in trying to be exactly like Perl is in the handling of escapes. We
2497    have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
2498    class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
2499    below handles the special case of \], but does not try to do any other escape
2500    processing. This makes it different from Perl for cases such as [:l\ower:]
2501    where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
2502    "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
2503    I think.
2504    
2505    A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
2506    It seems that the appearance of a nested POSIX class supersedes an apparent
2507    external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
2508    a digit.
2509    
2510    In Perl, unescaped square brackets may also appear as part of class names. For
2511    example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
2512    [:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
2513    seem right at all. PCRE does not allow closing square brackets in POSIX class
2514    names.
2515    
2516  Argument:  Arguments:
2517    ptr      pointer to the initial [    ptr      pointer to the initial [
2518    endptr   where to return the end pointer    endptr   where to return the end pointer
   cd       pointer to compile data  
2519    
2520  Returns:   TRUE or FALSE  Returns:   TRUE or FALSE
2521  */  */
2522    
2523  static BOOL  static BOOL
2524  check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)  check_posix_syntax(const uschar *ptr, const uschar **endptr)
2525  {  {
2526  int terminator;          /* Don't combine these lines; the Solaris cc */  int terminator;          /* Don't combine these lines; the Solaris cc */
2527  terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */  terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
2528  if (*(++ptr) == '^') ptr++;  for (++ptr; *ptr != 0; ptr++)
 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;  
 if (*ptr == terminator && ptr[1] == ']')  
2529    {    {
2530    *endptr = ptr;    if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2531    return TRUE;      ptr++;
2532      else if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2533      else
2534        {
2535        if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2536          {
2537          *endptr = ptr;
2538          return TRUE;
2539          }
2540        if (*ptr == CHAR_LEFT_SQUARE_BRACKET &&
2541             (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
2542              ptr[1] == CHAR_EQUALS_SIGN) &&
2543            check_posix_syntax(ptr, endptr))
2544          return FALSE;
2545        }
2546    }    }
2547  return FALSE;  return FALSE;
2548  }  }
# Line 1639  Returns: a value representing the na Line 2567  Returns: a value representing the na
2567  static int  static int
2568  check_posix_name(const uschar *ptr, int len)  check_posix_name(const uschar *ptr, int len)
2569  {  {
2570    const char *pn = posix_names;
2571  register int yield = 0;  register int yield = 0;
2572  while (posix_name_lengths[yield] != 0)  while (posix_name_lengths[yield] != 0)
2573    {    {
2574    if (len == posix_name_lengths[yield] &&    if (len == posix_name_lengths[yield] &&
2575      strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;      strncmp((const char *)ptr, pn, len) == 0) return yield;
2576      pn += posix_name_lengths[yield] + 1;
2577    yield++;    yield++;
2578    }    }
2579  return -1;  return -1;
# Line 1658  return -1; Line 2588  return -1;
2588  that is referenced. This means that groups can be replicated for fixed  that is referenced. This means that groups can be replicated for fixed
2589  repetition simply by copying (because the recursion is allowed to refer to  repetition simply by copying (because the recursion is allowed to refer to
2590  earlier groups that are outside the current group). However, when a group is  earlier groups that are outside the current group). However, when a group is
2591  optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before  optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
2592  it, after it has been compiled. This means that any OP_RECURSE items within it  inserted before it, after it has been compiled. This means that any OP_RECURSE
2593  that refer to the group itself or any contained groups have to have their  items within it that refer to the group itself or any contained groups have to
2594  offsets adjusted. That is one of the jobs of this function. Before it is called,  have their offsets adjusted. That is one of the jobs of this function. Before it
2595  the partially compiled regex must be temporarily terminated with OP_END.  is called, the partially compiled regex must be temporarily terminated with
2596    OP_END.
2597    
2598  This function has been extended with the possibility of forward references for  This function has been extended with the possibility of forward references for
2599  recursions and subroutine calls. It must also check the list of such references  recursions and subroutine calls. It must also check the list of such references
# Line 1685  adjust_recurse(uschar *group, int adjust Line 2616  adjust_recurse(uschar *group, int adjust
2616    uschar *save_hwm)    uschar *save_hwm)
2617  {  {
2618  uschar *ptr = group;  uschar *ptr = group;
2619    
2620  while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)  while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
2621    {    {
2622    int offset;    int offset;
# Line 1738  auto_callout(uschar *code, const uschar Line 2670  auto_callout(uschar *code, const uschar
2670  {  {
2671  *code++ = OP_CALLOUT;  *code++ = OP_CALLOUT;
2672  *code++ = 255;  *code++ = 255;
2673  PUT(code, 0, ptr - cd->start_pattern);  /* Pattern offset */  PUT(code, 0, (int)(ptr - cd->start_pattern));  /* Pattern offset */
2674  PUT(code, LINK_SIZE, 0);                /* Default length */  PUT(code, LINK_SIZE, 0);                       /* Default length */
2675  return code + 2*LINK_SIZE;  return code + 2*LINK_SIZE;
2676  }  }
2677    
# Line 1764  Returns: nothing Line 2696  Returns: nothing
2696  static void  static void
2697  complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)  complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
2698  {  {
2699  int length = ptr - cd->start_pattern - GET(previous_callout, 2);  int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));
2700  PUT(previous_callout, 2 + LINK_SIZE, length);  PUT(previous_callout, 2 + LINK_SIZE, length);
2701  }  }
2702    
# Line 1796  get_othercase_range(unsigned int *cptr, Line 2728  get_othercase_range(unsigned int *cptr,
2728  unsigned int c, othercase, next;  unsigned int c, othercase, next;
2729    
2730  for (c = *cptr; c <= d; c++)  for (c = *cptr; c <= d; c++)
2731    { if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) break; }    { if ((othercase = UCD_OTHERCASE(c)) != c) break; }
2732    
2733  if (c > d) return FALSE;  if (c > d) return FALSE;
2734    
# Line 1805  next = othercase + 1; Line 2737  next = othercase + 1;
2737    
2738  for (++c; c <= d; c++)  for (++c; c <= d; c++)
2739    {    {
2740    if (_pcre_ucp_othercase(c) != next) break;    if (UCD_OTHERCASE(c) != next) break;
2741    next++;    next++;
2742    }    }
2743    
# Line 1814  for (++c; c <= d; c++) Line 2746  for (++c; c <= d; c++)
2746    
2747  return TRUE;  return TRUE;
2748  }  }
2749    
2750    
2751    
2752    /*************************************************
2753    *        Check a character and a property        *
2754    *************************************************/
2755    
2756    /* This function is called by check_auto_possessive() when a property item
2757    is adjacent to a fixed character.
2758    
2759    Arguments:
2760      c            the character
2761      ptype        the property type
2762      pdata        the data for the type
2763      negated      TRUE if it's a negated property (\P or \p{^)
2764    
2765    Returns:       TRUE if auto-possessifying is OK
2766    */
2767    
2768    static BOOL
2769    check_char_prop(int c, int ptype, int pdata, BOOL negated)
2770    {
2771    const ucd_record *prop = GET_UCD(c);
2772    switch(ptype)
2773      {
2774      case PT_LAMP:
2775      return (prop->chartype == ucp_Lu ||
2776              prop->chartype == ucp_Ll ||
2777              prop->chartype == ucp_Lt) == negated;
2778    
2779      case PT_GC:
2780      return (pdata == _pcre_ucp_gentype[prop->chartype]) == negated;
2781    
2782      case PT_PC:
2783      return (pdata == prop->chartype) == negated;
2784    
2785      case PT_SC:
2786      return (pdata == prop->script) == negated;
2787    
2788      /* These are specials */
2789    
2790      case PT_ALNUM:
2791      return (_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2792              _pcre_ucp_gentype[prop->chartype] == ucp_N) == negated;
2793    
2794      case PT_SPACE:    /* Perl space */
2795      return (_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2796              c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2797              == negated;
2798    
2799      case PT_PXSPACE:  /* POSIX space */
2800      return (_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2801              c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2802              c == CHAR_FF || c == CHAR_CR)
2803              == negated;
2804    
2805      case PT_WORD:
2806      return (_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2807              _pcre_ucp_gentype[prop->chartype] == ucp_N ||
2808              c == CHAR_UNDERSCORE) == negated;
2809      }
2810    return FALSE;
2811    }
2812  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2813    
2814    
# Line 1827  whether the next thing could possibly ma Line 2822  whether the next thing could possibly ma
2822  sense to automatically possessify the repeated item.  sense to automatically possessify the repeated item.
2823    
2824  Arguments:  Arguments:
2825    op_code       the repeated op code    previous      pointer to the repeated opcode
   this          data for this item, depends on the opcode  
2826    utf8          TRUE in UTF-8 mode    utf8          TRUE in UTF-8 mode
   utf8_char     used for utf8 character bytes, NULL if not relevant  
2827    ptr           next character in pattern    ptr           next character in pattern
2828    options       options bits    options       options bits
2829    cd            contains pointers to tables etc.    cd            contains pointers to tables etc.
# Line 1839  Returns: TRUE if possessifying is Line 2832  Returns: TRUE if possessifying is
2832  */  */
2833    
2834  static BOOL  static BOOL
2835  check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,  check_auto_possessive(const uschar *previous, BOOL utf8, const uschar *ptr,
2836    const uschar *ptr, int options, compile_data *cd)    int options, compile_data *cd)
2837  {  {
2838  int next;  int c, next;
2839    int op_code = *previous++;
2840    
2841  /* Skip whitespace and comments in extended mode */  /* Skip whitespace and comments in extended mode */
2842    
# Line 1851  if ((options & PCRE_EXTENDED) != 0) Line 2845  if ((options & PCRE_EXTENDED) != 0)
2845    for (;;)    for (;;)
2846      {      {
2847      while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;      while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2848      if (*ptr == '#')      if (*ptr == CHAR_NUMBER_SIGN)
2849        {        {
2850        while (*(++ptr) != 0)        ptr++;
2851          while (*ptr != 0)
2852            {
2853          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2854            ptr++;
2855    #ifdef SUPPORT_UTF8
2856            if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
2857    #endif
2858            }
2859        }        }
2860      else break;      else break;
2861      }      }
# Line 1863  if ((options & PCRE_EXTENDED) != 0) Line 2864  if ((options & PCRE_EXTENDED) != 0)
2864  /* If the next item is one that we can handle, get its value. A non-negative  /* If the next item is one that we can handle, get its value. A non-negative
2865  value is a character, a negative value is an escape value. */  value is a character, a negative value is an escape value. */
2866    
2867  if (*ptr == '\\')  if (*ptr == CHAR_BACKSLASH)
2868    {    {
2869    int temperrorcode = 0;    int temperrorcode = 0;
2870    next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);    next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
# Line 1888  if ((options & PCRE_EXTENDED) != 0) Line 2889  if ((options & PCRE_EXTENDED) != 0)
2889    for (;;)    for (;;)
2890      {      {
2891      while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;      while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2892      if (*ptr == '#')      if (*ptr == CHAR_NUMBER_SIGN)
2893        {        {
2894        while (*(++ptr) != 0)        ptr++;
2895          while (*ptr != 0)
2896            {
2897          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2898            ptr++;
2899    #ifdef SUPPORT_UTF8
2900            if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
2901    #endif
2902            }
2903        }        }
2904      else break;      else break;
2905      }      }
# Line 1899  if ((options & PCRE_EXTENDED) != 0) Line 2907  if ((options & PCRE_EXTENDED) != 0)
2907    
2908  /* If the next thing is itself optional, we have to give up. */  /* If the next thing is itself optional, we have to give up. */
2909    
2910  if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)  if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
2911    return FALSE;    strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
2912        return FALSE;
 /* Now compare the next item with the previous opcode. If the previous is a  
 positive single character match, "item" either contains the character or, if  
 "item" is greater than 127 in utf8 mode, the character's bytes are in  
 utf8_char. */  
   
2913    
2914  /* Handle cases when the next item is a character. */  /* Now compare the next item with the previous opcode. First, handle cases when
2915    the next item is a character. */
2916    
2917  if (next >= 0) switch(op_code)  if (next >= 0) switch(op_code)
2918    {    {
2919    case OP_CHAR:    case OP_CHAR:
2920  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2921    if (utf8 && item > 127) { GETCHAR(item, utf8_char); }    GETCHARTEST(c, previous);
2922    #else
2923      c = *previous;
2924  #endif  #endif
2925    return item != next;    return c != next;
2926    
2927    /* For CHARNC (caseless character) we must check the other case. If we have    /* For CHARI (caseless character) we must check the other case. If we have
2928    Unicode property support, we can use it to test the other case of    Unicode property support, we can use it to test the other case of
2929    high-valued characters. */    high-valued characters. */
2930    
2931    case OP_CHARNC:    case OP_CHARI:
2932  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2933    if (utf8 && item > 127) { GETCHAR(item, utf8_char); }    GETCHARTEST(c, previous);
2934    #else
2935      c = *previous;
2936  #endif  #endif
2937    if (item == next) return FALSE;    if (c == next) return FALSE;
2938  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2939    if (utf8)    if (utf8)
2940      {      {
2941      unsigned int othercase;      unsigned int othercase;
2942      if (next < 128) othercase = cd->fcc[next]; else      if (next < 128) othercase = cd->fcc[next]; else
2943  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2944      othercase = _pcre_ucp_othercase((unsigned int)next);      othercase = UCD_OTHERCASE((unsigned int)next);
2945  #else  #else
2946      othercase = NOTACHAR;      othercase = NOTACHAR;
2947  #endif  #endif
2948      return (unsigned int)item != othercase;      return (unsigned int)c != othercase;
2949      }      }
2950    else    else
2951  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF8 */
2952    return (item != cd->fcc[next]);  /* Non-UTF-8 mode */    return (c != cd->fcc[next]);  /* Non-UTF-8 mode */
2953    
2954    /* For OP_NOT, "item" must be a single-byte character. */    /* For OP_NOT and OP_NOTI, the data is always a single-byte character. These
2955      opcodes are not used for multi-byte characters, because they are coded using
2956      an XCLASS instead. */
2957    
2958    case OP_NOT:    case OP_NOT:
2959    if (next < 0) return FALSE;  /* Not a character */    return (c = *previous) == next;
2960    if (item == next) return TRUE;  
2961    if ((options & PCRE_CASELESS) == 0) return FALSE;    case OP_NOTI:
2962      if ((c = *previous) == next) return TRUE;
2963  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2964    if (utf8)    if (utf8)
2965      {      {
2966      unsigned int othercase;      unsigned int othercase;
2967      if (next < 128) othercase = cd->fcc[next]; else      if (next < 128) othercase = cd->fcc[next]; else
2968  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2969      othercase = _pcre_ucp_othercase(next);      othercase = UCD_OTHERCASE(next);
2970  #else  #else
2971      othercase = NOTACHAR;      othercase = NOTACHAR;
2972  #endif  #endif
2973      return (unsigned int)item == othercase;      return (unsigned int)c == othercase;
2974      }      }
2975    else    else
2976  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF8 */
2977    return (item == cd->fcc[next]);  /* Non-UTF-8 mode */    return (c == cd->fcc[next]);  /* Non-UTF-8 mode */
2978    
2979      /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.
2980      When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
2981    
2982    case OP_DIGIT:    case OP_DIGIT:
2983    return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;    return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
# Line 2006  if (next >= 0) switch(op_code) Line 3020  if (next >= 0) switch(op_code)
3020      case 0x202f:      case 0x202f:
3021      case 0x205f:      case 0x205f:
3022      case 0x3000:      case 0x3000:
3023      return op_code != OP_HSPACE;      return op_code == OP_NOT_HSPACE;
3024      default:      default:
3025      return op_code == OP_HSPACE;      return op_code != OP_NOT_HSPACE;
3026      }      }
3027    
3028      case OP_ANYNL:
3029    case OP_VSPACE:    case OP_VSPACE:
3030    case OP_NOT_VSPACE:    case OP_NOT_VSPACE:
3031    switch(next)    switch(next)
# Line 2022  if (next >= 0) switch(op_code) Line 3037  if (next >= 0) switch(op_code)
3037      case 0x85:      case 0x85:
3038      case 0x2028:      case 0x2028:
3039      case 0x2029:      case 0x2029:
3040      return op_code != OP_VSPACE;      return op_code == OP_NOT_VSPACE;
3041      default:      default:
3042      return op_code == OP_VSPACE;      return op_code != OP_NOT_VSPACE;
3043      }      }
3044    
3045    #ifdef SUPPORT_UCP
3046      case OP_PROP:
3047      return check_char_prop(next, previous[0], previous[1], FALSE);
3048    
3049      case OP_NOTPROP:
3050      return check_char_prop(next, previous[0], previous[1], TRUE);
3051    #endif
3052    
3053    default:    default:
3054    return FALSE;    return FALSE;
3055    }    }
3056    
3057    
3058  /* Handle the case when the next item is \d, \s, etc. */  /* Handle the case when the next item is \d, \s, etc. Note that when PCRE_UCP
3059    is set, \d turns into ESC_du rather than ESC_d, etc., so ESC_d etc. are
3060    generated only when PCRE_UCP is *not* set, that is, when only ASCII
3061    characteristics are recognized. Similarly, the opcodes OP_DIGIT etc. are
3062    replaced by OP_PROP codes when PCRE_UCP is set. */
3063    
3064  switch(op_code)  switch(op_code)
3065    {    {
3066    case OP_CHAR:    case OP_CHAR:
3067    case OP_CHARNC:    case OP_CHARI:
3068  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
3069    if (utf8 && item > 127) { GETCHAR(item, utf8_char); }    GETCHARTEST(c, previous);
3070    #else
3071      c = *previous;
3072  #endif  #endif
3073    switch(-next)    switch(-next)
3074      {      {
3075      case ESC_d:      case ESC_d:
3076      return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;      return c > 127 || (cd->ctypes[c] & ctype_digit) == 0;
3077    
3078      case ESC_D:      case ESC_D:
3079      return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;      return c <= 127 && (cd->ctypes[c] & ctype_digit) != 0;
3080    
3081      case ESC_s:      case ESC_s:
3082      return item > 127 || (cd->ctypes[item] & ctype_space) == 0;      return c > 127 || (cd->ctypes[c] & ctype_space) == 0;
3083    
3084      case ESC_S:      case ESC_S:
3085      return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;      return c <= 127 && (cd->ctypes[c] & ctype_space) != 0;
3086    
3087      case ESC_w:      case ESC_w:
3088      return item > 127 || (cd->ctypes[item] & ctype_word) == 0;      return c > 127 || (cd->ctypes[c] & ctype_word) == 0;
3089    
3090      case ESC_W:      case ESC_W:
3091      return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;      return c <= 127 && (cd->ctypes[c] & ctype_word) != 0;
3092    
3093      case ESC_h:      case ESC_h:
3094      case ESC_H:      case ESC_H:
3095      switch(item)      switch(c)
3096        {        {
3097        case 0x09:        case 0x09:
3098        case 0x20:        case 0x20:
# Line 2091  switch(op_code) Line 3120  switch(op_code)
3120    
3121      case ESC_v:      case ESC_v:
3122      case ESC_V:      case ESC_V:
3123      switch(item)      switch(c)
3124        {        {
3125        case 0x0a:        case 0x0a:
3126        case 0x0b:        case 0x0b:
# Line 2105  switch(op_code) Line 3134  switch(op_code)
3134        return -next == ESC_v;        return -next == ESC_v;
3135        }        }
3136    
3137        /* When PCRE_UCP is set, these values get generated for \d etc. Find
3138        their substitutions and process them. The result will always be either
3139        -ESC_p or -ESC_P. Then fall through to process those values. */
3140    
3141    #ifdef SUPPORT_UCP
3142        case ESC_du:
3143        case ESC_DU:
3144        case ESC_wu:
3145        case ESC_WU:
3146        case ESC_su:
3147        case ESC_SU:
3148          {
3149          int temperrorcode = 0;
3150          ptr = substitutes[-next - ESC_DU];
3151          next = check_escape(&ptr, &temperrorcode, 0, options, FALSE);
3152          if (temperrorcode != 0) return FALSE;
3153          ptr++;    /* For compatibility */
3154          }
3155        /* Fall through */
3156    
3157        case ESC_p:
3158        case ESC_P:
3159          {
3160          int ptype, pdata, errorcodeptr;
3161          BOOL negated;
3162    
3163          ptr--;      /* Make ptr point at the p or P */
3164          ptype = get_ucp(&ptr, &negated, &pdata, &errorcodeptr);
3165          if (ptype < 0) return FALSE;
3166          ptr++;      /* Point past the final curly ket */
3167    
3168          /* If the property item is optional, we have to give up. (When generated
3169          from \d etc by PCRE_UCP, this test will have been applied much earlier,
3170          to the original \d etc. At this point, ptr will point to a zero byte.) */
3171    
3172          if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
3173            strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
3174              return FALSE;
3175    
3176          /* Do the property check. */
3177    
3178          return check_char_prop(c, ptype, pdata, (next == -ESC_P) != negated);
3179          }
3180    #endif
3181    
3182      default:      default:
3183      return FALSE;      return FALSE;
3184      }      }
3185    
3186      /* In principle, support for Unicode properties should be integrated here as
3187      well. It means re-organizing the above code so as to get hold of the property
3188      values before switching on the op-code. However, I wonder how many patterns
3189      combine ASCII \d etc with Unicode properties? (Note that if PCRE_UCP is set,
3190      these op-codes are never generated.) */
3191    
3192    case OP_DIGIT:    case OP_DIGIT:
3193    return next == -ESC_D || next == -ESC_s || next == -ESC_W ||    return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
3194           next == -ESC_h || next == -ESC_v;           next == -ESC_h || next == -ESC_v || next == -ESC_R;
3195    
3196    case OP_NOT_DIGIT:    case OP_NOT_DIGIT:
3197    return next == -ESC_d;    return next == -ESC_d;
3198    
3199    case OP_WHITESPACE:    case OP_WHITESPACE:
3200    return next == -ESC_S || next == -ESC_d || next == -ESC_w;    return next == -ESC_S || next == -ESC_d || next == -ESC_w || next == -ESC_R;
3201    
3202    case OP_NOT_WHITESPACE:    case OP_NOT_WHITESPACE:
3203    return next == -ESC_s || next == -ESC_h || next == -ESC_v;    return next == -ESC_s || next == -ESC_h || next == -ESC_v;
3204    
3205    case OP_HSPACE:    case OP_HSPACE:
3206    return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;    return next == -ESC_S || next == -ESC_H || next == -ESC_d ||
3207             next == -ESC_w || next == -ESC_v || next == -ESC_R;
3208    
3209    case OP_NOT_HSPACE:    case OP_NOT_HSPACE:
3210    return next == -ESC_h;    return next == -ESC_h;
3211    
3212    /* Can't have \S in here because VT matches \S (Perl anomaly) */    /* Can't have \S in here because VT matches \S (Perl anomaly) */
3213      case OP_ANYNL:
3214    case OP_VSPACE:    case OP_VSPACE:
3215    return next == -ESC_V || next == -ESC_d || next == -ESC_w;    return next == -ESC_V || next == -ESC_d || next == -ESC_w;
3216    
3217    case OP_NOT_VSPACE:    case OP_NOT_VSPACE:
3218    return next == -ESC_v;    return next == -ESC_v || next == -ESC_R;
3219    
3220    case OP_WORDCHAR:    case OP_WORDCHAR:
3221    return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;    return next == -ESC_W || next == -ESC_s || next == -ESC_h ||
3222             next == -ESC_v || next == -ESC_R;
3223    
3224    case OP_NOT_WORDCHAR:    case OP_NOT_WORDCHAR:
3225    return next == -ESC_w || next == -ESC_d;    return next == -ESC_w || next == -ESC_d;
# Line 2168  Arguments: Line 3251  Arguments:
3251    firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)    firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
3252    reqbyteptr     set to the last literal character required, else < 0    reqbyteptr     set to the last literal character required, else < 0
3253    bcptr          points to current branch chain    bcptr          points to current branch chain
3254      cond_depth     conditional nesting depth
3255    cd             contains pointers to tables etc.    cd             contains pointers to tables etc.
3256    lengthptr      NULL during the real compile phase    lengthptr      NULL during the real compile phase
3257                   points to length accumulator during pre-compile phase                   points to length accumulator during pre-compile phase
# Line 2179  Returns: TRUE on success Line 3263  Returns: TRUE on success
3263  static BOOL  static BOOL
3264  compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,  compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
3265    int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,    int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
3266    compile_data *cd, int *lengthptr)    int cond_depth, compile_data *cd, int *lengthptr)
3267  {  {
3268  int repeat_type, op_type;  int repeat_type, op_type;
3269  int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */  int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
# Line 2188  int greedy_default, greedy_non_default; Line 3272  int greedy_default, greedy_non_default;
3272  int firstbyte, reqbyte;  int firstbyte, reqbyte;
3273  int zeroreqbyte, zerofirstbyte;  int zeroreqbyte, zerofirstbyte;
3274  int req_caseopt, reqvary, tempreqvary;  int req_caseopt, reqvary, tempreqvary;
3275  int options = *optionsptr;  int options = *optionsptr;               /* May change dynamically */
3276  int after_manual_callout = 0;  int after_manual_callout = 0;
3277  int length_prevgroup = 0;  int length_prevgroup = 0;
3278  register int c;  register int c;
# Line 2200  BOOL inescq = FALSE; Line 3284  BOOL inescq = FALSE;
3284  BOOL groupsetfirstbyte = FALSE;  BOOL groupsetfirstbyte = FALSE;
3285  const uschar *ptr = *ptrptr;  const uschar *ptr = *ptrptr;
3286  const uschar *tempptr;  const uschar *tempptr;
3287    const uschar *nestptr = NULL;
3288  uschar *previous = NULL;  uschar *previous = NULL;
3289  uschar *previous_callout = NULL;  uschar *previous_callout = NULL;
3290  uschar *save_hwm = NULL;  uschar *save_hwm = NULL;
3291  uschar classbits[32];  uschar classbits[32];
3292    
3293    /* We can fish out the UTF-8 setting once and for all into a BOOL, but we
3294    must not do this for other options (e.g. PCRE_EXTENDED) because they may change
3295    dynamically as we process the pattern. */
3296    
3297  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
3298  BOOL class_utf8;  BOOL class_utf8;
3299  BOOL utf8 = (options & PCRE_UTF8) != 0;  BOOL utf8 = (options & PCRE_UTF8) != 0;
3300  uschar *class_utf8data;  uschar *class_utf8data;
3301    uschar *class_utf8data_base;
3302  uschar utf8_char[6];  uschar utf8_char[6];
3303  #else  #else
3304  BOOL utf8 = FALSE;  BOOL utf8 = FALSE;
 uschar *utf8_char = NULL;  
3305  #endif  #endif
3306    
3307  #ifdef DEBUG  #ifdef PCRE_DEBUG
3308  if (lengthptr != NULL) DPRINTF((">> start branch\n"));  if (lengthptr != NULL) DPRINTF((">> start branch\n"));
3309  #endif  #endif
3310    
# Line 2248  req_caseopt = ((options & PCRE_CASELESS) Line 3337  req_caseopt = ((options & PCRE_CASELESS)
3337  for (;; ptr++)  for (;; ptr++)
3338    {    {
3339    BOOL negate_class;    BOOL negate_class;
3340      BOOL should_flip_negation;
3341    BOOL possessive_quantifier;    BOOL possessive_quantifier;
3342    BOOL is_quantifier;    BOOL is_quantifier;
3343    BOOL is_recurse;    BOOL is_recurse;
# Line 2262  for (;; ptr++) Line 3352  for (;; ptr++)
3352    int subfirstbyte;    int subfirstbyte;
3353    int terminator;    int terminator;
3354    int mclength;    int mclength;
3355      int tempbracount;
3356    uschar mcbuffer[8];    uschar mcbuffer[8];
3357    
3358    /* Get next byte in the pattern */    /* Get next byte in the pattern */
3359    
3360    c = *ptr;    c = *ptr;
3361    
3362      /* If we are at the end of a nested substitution, revert to the outer level
3363      string. Nesting only happens one level deep. */
3364    
3365      if (c == 0 && nestptr != NULL)
3366        {
3367        ptr = nestptr;
3368        nestptr = NULL;
3369        c = *ptr;
3370        }
3371    
3372    /* If we are in the pre-compile phase, accumulate the length used for the    /* If we are in the pre-compile phase, accumulate the length used for the
3373    previous cycle of this loop. */    previous cycle of this loop. */
3374    
3375    if (lengthptr != NULL)    if (lengthptr != NULL)
3376      {      {
3377  #ifdef DEBUG  #ifdef PCRE_DEBUG
3378      if (code > cd->hwm) cd->hwm = code;                 /* High water info */      if (code > cd->hwm) cd->hwm = code;                 /* High water info */
3379  #endif  #endif
3380      if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */      if (code > cd->start_workspace + cd->workspace_size -
3381            WORK_SIZE_SAFETY_MARGIN)                       /* Check for overrun */
3382        {        {
3383        *errorcodeptr = ERR52;        *errorcodeptr = ERR52;
3384        goto FAILED;        goto FAILED;
# Line 2298  for (;; ptr++) Line 3400  for (;; ptr++)
3400        goto FAILED;        goto FAILED;
3401        }        }
3402    
3403      *lengthptr += code - last_code;      *lengthptr += (int)(code - last_code);
3404      DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));      DPRINTF(("length=%d added %d c=%c\n", *lengthptr, (int)(code - last_code),
3405          c));
3406    
3407      /* If "previous" is set and it is not at the start of the work space, move      /* If "previous" is set and it is not at the start of the work space, move
3408      it back to there, in order to avoid filling up the work space. Otherwise,      it back to there, in order to avoid filling up the work space. Otherwise,
# Line 2325  for (;; ptr++) Line 3428  for (;; ptr++)
3428    /* In the real compile phase, just check the workspace used by the forward    /* In the real compile phase, just check the workspace used by the forward
3429    reference list. */    reference list. */
3430    
3431    else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)    else if (cd->hwm > cd->start_workspace + cd->workspace_size -
3432               WORK_SIZE_SAFETY_MARGIN)
3433      {      {
3434      *errorcodeptr = ERR52;      *errorcodeptr = ERR52;
3435      goto FAILED;      goto FAILED;
# Line 2335  for (;; ptr++) Line 3439  for (;; ptr++)
3439    
3440    if (inescq && c != 0)    if (inescq && c != 0)
3441      {      {
3442      if (c == '\\' && ptr[1] == 'E')      if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
3443        {        {
3444        inescq = FALSE;        inescq = FALSE;
3445        ptr++;        ptr++;
# Line 2361  for (;; ptr++) Line 3465  for (;; ptr++)
3465    /* Fill in length of a previous callout, except when the next thing is    /* Fill in length of a previous callout, except when the next thing is
3466    a quantifier. */    a quantifier. */
3467    
3468    is_quantifier = c == '*' || c == '+' || c == '?' ||    is_quantifier =
3469      (c == '{' && is_counted_repeat(ptr+1));      c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
3470        (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
3471    
3472    if (!is_quantifier && previous_callout != NULL &&    if (!is_quantifier && previous_callout != NULL &&
3473         after_manual_callout-- <= 0)         after_manual_callout-- <= 0)
# Line 2372  for (;; ptr++) Line 3477  for (;; ptr++)
3477      previous_callout = NULL;      previous_callout = NULL;
3478      }      }
3479    
3480    /* In extended mode, skip white space and comments */    /* In extended mode, skip white space and comments. */
3481    
3482    if ((options & PCRE_EXTENDED) != 0)    if ((options & PCRE_EXTENDED) != 0)
3483      {      {
3484      if ((cd->ctypes[c] & ctype_space) != 0) continue;      if ((cd->ctypes[c] & ctype_space) != 0) continue;
3485      if (c == '#')      if (c == CHAR_NUMBER_SIGN)
3486        {        {
3487        while (*(++ptr) != 0)        ptr++;
3488          while (*ptr != 0)
3489          {          {
3490          if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }          if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
3491            ptr++;
3492    #ifdef SUPPORT_UTF8
3493            if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
3494    #endif
3495          }          }
3496        if (*ptr != 0) continue;        if (*ptr != 0) continue;
3497    
# Line 2402  for (;; ptr++) Line 3512  for (;; ptr++)
3512      {      {
3513      /* ===================================================================*/      /* ===================================================================*/
3514      case 0:                        /* The branch terminates at string end */      case 0:                        /* The branch terminates at string end */
3515      case '|':                      /* or | or ) */      case CHAR_VERTICAL_LINE:       /* or | or ) */
3516      case ')':      case CHAR_RIGHT_PARENTHESIS:
3517      *firstbyteptr = firstbyte;      *firstbyteptr = firstbyte;
3518      *reqbyteptr = reqbyte;      *reqbyteptr = reqbyte;
3519      *codeptr = code;      *codeptr = code;
# Line 2415  for (;; ptr++) Line 3525  for (;; ptr++)
3525          *errorcodeptr = ERR20;          *errorcodeptr = ERR20;
3526          goto FAILED;          goto FAILED;
3527          }          }
3528        *lengthptr += code - last_code;   /* To include callout length */        *lengthptr += (int)(code - last_code);   /* To include callout length */
3529        DPRINTF((">> end branch\n"));        DPRINTF((">> end branch\n"));
3530        }        }
3531      return TRUE;      return TRUE;
# Line 2425  for (;; ptr++) Line 3535  for (;; ptr++)
3535      /* Handle single-character metacharacters. In multiline mode, ^ disables      /* Handle single-character metacharacters. In multiline mode, ^ disables
3536      the setting of any following char as a first character. */      the setting of any following char as a first character. */
3537    
3538      case '^':      case CHAR_CIRCUMFLEX_ACCENT:
3539        previous = NULL;
3540      if ((options & PCRE_MULTILINE) != 0)      if ((options & PCRE_MULTILINE) != 0)
3541        {        {
3542        if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;        if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3543          *code++ = OP_CIRCM;
3544        }        }
3545      previous = NULL;      else *code++ = OP_CIRC;
     *code++ = OP_CIRC;  
3546      break;      break;
3547    
3548      case '$':      case CHAR_DOLLAR_SIGN:
3549      previous = NULL;      previous = NULL;
3550      *code++ = OP_DOLL;      *code++ = ((options & PCRE_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
3551      break;      break;
3552    
3553      /* There can never be a first char if '.' is first, whatever happens about      /* There can never be a first char if '.' is first, whatever happens about
3554      repeats. The value of reqbyte doesn't change either. */      repeats. The value of reqbyte doesn't change either. */
3555    
3556      case '.':      case CHAR_DOT:
3557      if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;      if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3558      zerofirstbyte = firstbyte;      zerofirstbyte = firstbyte;
3559      zeroreqbyte = reqbyte;      zeroreqbyte = reqbyte;
3560      previous = code;      previous = code;
3561      *code++ = OP_ANY;      *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
3562      break;      break;
3563    
3564    
# Line 2462  for (;; ptr++) Line 3573  for (;; ptr++)
3573      opcode is compiled. It may optionally have a bit map for characters < 256,      opcode is compiled. It may optionally have a bit map for characters < 256,
3574      but those above are explicitly listed afterwards. A flag byte tells      but those above are explicitly listed afterwards. A flag byte tells
3575      whether the bitmap is present, and whether this is a negated class or not.      whether the bitmap is present, and whether this is a negated class or not.
     */  
3576    
3577      case '[':      In JavaScript compatibility mode, an isolated ']' causes an error. In
3578        default (Perl) mode, it is treated as a data character. */
3579    
3580        case CHAR_RIGHT_SQUARE_BRACKET:
3581        if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
3582          {
3583          *errorcodeptr = ERR64;
3584          goto FAILED;
3585          }
3586        goto NORMAL_CHAR;
3587    
3588        case CHAR_LEFT_SQUARE_BRACKET:
3589      previous = code;      previous = code;
3590    
3591      /* PCRE supports POSIX class stuff inside a class. Perl gives an error if      /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
3592      they are encountered at the top level, so we'll do that too. */      they are encountered at the top level, so we'll do that too. */
3593    
3594      if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&      if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3595          check_posix_syntax(ptr, &tempptr, cd))           ptr[1] == CHAR_EQUALS_SIGN) &&
3596            check_posix_syntax(ptr, &tempptr))
3597        {        {
3598        *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;        *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
3599        goto FAILED;        goto FAILED;
3600        }        }
3601    
# Line 2485  for (;; ptr++) Line 3607  for (;; ptr++)
3607      for (;;)      for (;;)
3608        {        {
3609        c = *(++ptr);        c = *(++ptr);
3610        if (c == '\\')        if (c == CHAR_BACKSLASH)
3611          {          {
3612          if (ptr[1] == 'E') ptr++;          if (ptr[1] == CHAR_E)
3613            else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;            ptr++;
3614              else break;          else if (strncmp((const char *)ptr+1,
3615                              STR_Q STR_BACKSLASH STR_E, 3) == 0)
3616              ptr += 3;
3617            else
3618              break;
3619          }          }
3620        else if (!negate_class && c == '^')        else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
3621          negate_class = TRUE;          negate_class = TRUE;
3622        else break;        else break;
3623        }        }
3624    
3625        /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
3626        an initial ']' is taken as a data character -- the code below handles
3627        that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
3628        [^] must match any character, so generate OP_ALLANY. */
3629    
3630        if (c == CHAR_RIGHT_SQUARE_BRACKET &&
3631            (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
3632          {
3633          *code++ = negate_class? OP_ALLANY : OP_FAIL;
3634          if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3635          zerofirstbyte = firstbyte;
3636          break;
3637          }
3638    
3639        /* If a class contains a negative special such as \S, we need to flip the
3640        negation flag at the end, so that support for characters > 255 works
3641        correctly (they are all included in the class). */
3642    
3643        should_flip_negation = FALSE;
3644    
3645      /* Keep a count of chars with values < 256 so that we can optimize the case      /* Keep a count of chars with values < 256 so that we can optimize the case
3646      of just a single character (as long as it's < 256). However, for higher      of just a single character (as long as it's < 256). However, for higher
3647      valued UTF-8 characters, we don't yet do any optimization. */      valued UTF-8 characters, we don't yet do any optimization. */
# Line 2513  for (;; ptr++) Line 3659  for (;; ptr++)
3659  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
3660      class_utf8 = FALSE;                       /* No chars >= 256 */      class_utf8 = FALSE;                       /* No chars >= 256 */
3661      class_utf8data = code + LINK_SIZE + 2;    /* For UTF-8 items */      class_utf8data = code + LINK_SIZE + 2;    /* For UTF-8 items */
3662        class_utf8data_base = class_utf8data;     /* For resetting in pass 1 */
3663  #endif  #endif
3664    
3665      /* Process characters until ] is reached. By writing this as a "do" it      /* Process characters until ] is reached. By writing this as a "do" it
# Line 2528  for (;; ptr++) Line 3675  for (;; ptr++)
3675          {                           /* Braces are required because the */          {                           /* Braces are required because the */
3676          GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */          GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
3677          }          }
3678    
3679          /* In the pre-compile phase, accumulate the length of any UTF-8 extra
3680          data and reset the pointer. This is so that very large classes that
3681          contain a zillion UTF-8 characters no longer overwrite the work space
3682          (which is on the stack). */
3683    
3684          if (lengthptr != NULL)
3685            {
3686            *lengthptr += (int)(class_utf8data - class_utf8data_base);
3687            class_utf8data = class_utf8data_base;
3688            }
3689    
3690  #endif  #endif
3691    
3692        /* Inside \Q...\E everything is literal except \E */        /* Inside \Q...\E everything is literal except \E */
3693    
3694        if (inescq)        if (inescq)
3695          {          {
3696          if (c == '\\' && ptr[1] == 'E')     /* If we are at \E */          if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)  /* If we are at \E */
3697            {            {
3698            inescq = FALSE;                   /* Reset literal state */            inescq = FALSE;                   /* Reset literal state */
3699            ptr++;                            /* Skip the 'E' */            ptr++;                            /* Skip the 'E' */
# Line 2549  for (;; ptr++) Line 3708  for (;; ptr++)
3708        [.ch.] and [=ch=] ("collating elements") and fault them, as Perl        [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
3709        5.6 and 5.8 do. */        5.6 and 5.8 do. */
3710    
3711        if (c == '[' &&        if (c == CHAR_LEFT_SQUARE_BRACKET &&
3712            (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&            (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3713            check_posix_syntax(ptr, &tempptr, cd))             ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
3714          {          {
3715          BOOL local_negate = FALSE;          BOOL local_negate = FALSE;
3716          int posix_class, taboffset, tabopt;          int posix_class, taboffset, tabopt;
3717          register const uschar *cbits = cd->cbits;          register const uschar *cbits = cd->cbits;
3718          uschar pbits[32];          uschar pbits[32];
3719    
3720          if (ptr[1] != ':')          if (ptr[1] != CHAR_COLON)
3721            {            {
3722            *errorcodeptr = ERR31;            *errorcodeptr = ERR31;
3723            goto FAILED;            goto FAILED;
3724            }            }
3725    
3726          ptr += 2;          ptr += 2;
3727          if (*ptr == '^')          if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
3728            {            {
3729            local_negate = TRUE;            local_negate = TRUE;
3730              should_flip_negation = TRUE;  /* Note negative special */
3731            ptr++;            ptr++;
3732            }            }
3733    
3734          posix_class = check_posix_name(ptr, tempptr - ptr);          posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
3735          if (posix_class < 0)          if (posix_class < 0)
3736            {            {
3737            *errorcodeptr = ERR30;            *errorcodeptr = ERR30;
# Line 2585  for (;; ptr++) Line 3745  for (;; ptr++)
3745          if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)          if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
3746            posix_class = 0;            posix_class = 0;
3747    
3748          /* We build the bit map for the POSIX class in a chunk of local store          /* When PCRE_UCP is set, some of the POSIX classes are converted to
3749          because we may be adding and subtracting from it, and we don't want to          different escape sequences that use Unicode properties. */
3750          subtract bits that may be in the main map already. At the end we or the  
3751          result into the bit map that is being built. */  #ifdef SUPPORT_UCP
3752            if ((options & PCRE_UCP) != 0)
3753              {
3754              int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);
3755              if (posix_substitutes[pc] != NULL)
3756                {
3757                nestptr = tempptr + 1;
3758                ptr = posix_substitutes[pc] - 1;
3759                continue;
3760                }
3761              }
3762    #endif
3763            /* In the non-UCP case, we build the bit map for the POSIX class in a
3764            chunk of local store because we may be adding and subtracting from it,
3765            and we don't want to subtract bits that may be in the main map already.
3766            At the end we or the result into the bit map that is being built. */
3767    
3768          posix_class *= 3;          posix_class *= 3;
3769    
# Line 2632  for (;; ptr++) Line 3807  for (;; ptr++)
3807    
3808        /* Backslash may introduce a single character, or it may introduce one        /* Backslash may introduce a single character, or it may introduce one
3809        of the specials, which just set a flag. The sequence \b is a special        of the specials, which just set a flag. The sequence \b is a special
3810        case. Inside a class (and only there) it is treated as backspace.        case. Inside a class (and only there) it is treated as backspace. We
3811        Elsewhere it marks a word boundary. Other escapes have preset maps ready        assume that other escapes have more than one character in them, so set
3812        to 'or' into the one we are building. We assume they have more than one        class_charcount bigger than one. Unrecognized escapes fall through and
3813        character in them, so set class_charcount bigger than one. */        are either treated as literal characters (by default), or are faulted if
3814          PCRE_EXTRA is set. */
3815    
3816        if (c == '\\')        if (c == CHAR_BACKSLASH)
3817          {          {
3818          c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);          c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3819          if (*errorcodeptr != 0) goto FAILED;          if (*errorcodeptr != 0) goto FAILED;
3820    
3821          if (-c == ESC_b) c = '\b';       /* \b is backslash in a class */          if (-c == ESC_b) c = CHAR_BS;    /* \b is backspace in a class */
3822          else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */          else if (-c == ESC_N)            /* \N is not supported in a class */
3823          else if (-c == ESC_R) c = 'R';   /* \R is literal R in a class */            {
3824              *errorcodeptr = ERR71;
3825              goto FAILED;
3826              }
3827          else if (-c == ESC_Q)            /* Handle start of quoted string */          else if (-c == ESC_Q)            /* Handle start of quoted string */
3828            {            {
3829            if (ptr[1] == '\\' && ptr[2] == 'E')            if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3830              {              {
3831              ptr += 2; /* avoid empty string */              ptr += 2; /* avoid empty string */
3832              }              }
3833            else inescq = TRUE;            else inescq = TRUE;
3834            continue;            continue;
3835            }            }
3836            else if (-c == ESC_E) continue;  /* Ignore orphan \E */
3837    
3838          if (c < 0)          if (c < 0)
3839            {            {
3840            register const uschar *cbits = cd->cbits;            register const uschar *cbits = cd->cbits;
3841            class_charcount += 2;     /* Greater than 1 is what matters */            class_charcount += 2;     /* Greater than 1 is what matters */
3842    
3843            /* Save time by not doing this in the pre-compile phase. */            switch (-c)
   
           if (lengthptr == NULL) switch (-c)  
3844              {              {
3845    #ifdef SUPPORT_UCP
3846                case ESC_du:     /* These are the values given for \d etc */
3847                case ESC_DU:     /* when PCRE_UCP is set. We replace the */
3848                case ESC_wu:     /* escape sequence with an appropriate \p */
3849                case ESC_WU:     /* or \P to test Unicode properties instead */
3850                case ESC_su:     /* of the default ASCII testing. */
3851                case ESC_SU:
3852                nestptr = ptr;
3853                ptr = substitutes[-c - ESC_DU] - 1;  /* Just before substitute */
3854                class_charcount -= 2;                /* Undo! */
3855                continue;
3856    #endif
3857              case ESC_d:              case ESC_d:
3858              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
3859              continue;              continue;
3860    
3861              case ESC_D:              case ESC_D:
3862                should_flip_negation = TRUE;
3863              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
3864              continue;              continue;
3865    
# Line 2677  for (;; ptr++) Line 3868  for (;; ptr++)
3868              continue;              continue;
3869    
3870              case ESC_W:              case ESC_W:
3871                should_flip_negation = TRUE;
3872              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
3873              continue;              continue;
3874    
3875                /* Perl 5.004 onwards omits VT from \s, but we must preserve it
3876                if it was previously set by something earlier in the character
3877                class. */
3878    
3879              case ESC_s:              case ESC_s:
3880              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];              classbits[0] |= cbits[cbit_space];
3881              classbits[1] &= ~0x08;   /* Perl 5.004 onwards omits VT from \s */              classbits[1] |= cbits[cbit_space+1] & ~0x08;
3882                for (c = 2; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
3883              continue;              continue;
3884    
3885              case ESC_S:              case ESC_S:
3886                should_flip_negation = TRUE;
3887              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
3888              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
3889              continue;              continue;
3890    
3891              case ESC_E: /* Perl ignores an orphan \E */              case ESC_h:
             continue;  
   
             default:    /* Not recognized; fall through */  
             break;      /* Need "default" setting to stop compiler warning. */  
             }  
   
           /* In the pre-compile phase, just do the recognition. */  
   
           else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||  
                    c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;  
   
           /* We need to deal with \H, \h, \V, and \v in both phases because  
           they use extra memory. */  
   
           if (-c == ESC_h)  
             {  
3892              SETBIT(classbits, 0x09); /* VT */              SETBIT(classbits, 0x09); /* VT */
3893              SETBIT(classbits, 0x20); /* SPACE */              SETBIT(classbits, 0x20); /* SPACE */
3894              SETBIT(classbits, 0xa0); /* NBSP */              SETBIT(classbits, 0xa0); /* NBSP */
# Line 2730  for (;; ptr++) Line 3912  for (;; ptr++)
3912                }                }
3913  #endif  #endif
3914              continue;              continue;
             }  
3915    
3916            if (-c == ESC_H)              case ESC_H:
             {  
3917              for (c = 0; c < 32; c++)              for (c = 0; c < 32; c++)
3918                {                {
3919                int x = 0xff;                int x = 0xff;
# Line 2775  for (;; ptr++) Line 3955  for (;; ptr++)
3955                }                }
3956  #endif  #endif
3957              continue;              continue;
             }  
3958    
3959            if (-c == ESC_v)              case ESC_v:
             {  
3960              SETBIT(classbits, 0x0a); /* LF */              SETBIT(classbits, 0x0a); /* LF */
3961              SETBIT(classbits, 0x0b); /* VT */              SETBIT(classbits, 0x0b); /* VT */
3962              SETBIT(classbits, 0x0c); /* FF */              SETBIT(classbits, 0x0c); /* FF */
# Line 2794  for (;; ptr++) Line 3972  for (;; ptr++)
3972                }                }
3973  #endif  #endif
3974              continue;              continue;
             }  
3975    
3976            if (-c == ESC_V)              case ESC_V:
             {  
3977              for (c = 0; c < 32; c++)              for (c = 0; c < 32; c++)
3978                {                {
3979                int x = 0xff;                int x = 0xff;
# Line 2827  for (;; ptr++) Line 4003  for (;; ptr++)
4003                }                }
4004  #endif  #endif
4005              continue;              continue;
             }  
   
           /* We need to deal with \P and \p in both phases. */  
4006    
4007  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
4008            if (-c == ESC_p || -c == ESC_P)              case ESC_p:
4009              {              case ESC_P:
4010              BOOL negated;                {
4011              int pdata;                BOOL negated;
4012              int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);                int pdata;
4013              if (ptype < 0) goto FAILED;                int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
4014              class_utf8 = TRUE;                if (ptype < 0) goto FAILED;
4015              *class_utf8data++ = ((-c == ESC_p) != negated)?                class_utf8 = TRUE;
4016                XCL_PROP : XCL_NOTPROP;                *class_utf8data++ = ((-c == ESC_p) != negated)?
4017              *class_utf8data++ = ptype;                  XCL_PROP : XCL_NOTPROP;
4018              *class_utf8data++ = pdata;                *class_utf8data++ = ptype;
4019              class_charcount -= 2;   /* Not a < 256 character */                *class_utf8data++ = pdata;
4020              continue;                class_charcount -= 2;   /* Not a < 256 character */
4021              }                continue;
4022                  }
4023  #endif  #endif
4024            /* Unrecognized escapes are faulted if PCRE is running in its              /* Unrecognized escapes are faulted if PCRE is running in its
4025            strict mode. By default, for compatibility with Perl, they are              strict mode. By default, for compatibility with Perl, they are
4026            treated as literals. */              treated as literals. */
4027    
4028            if ((options & PCRE_EXTRA) != 0)              default:
4029              {              if ((options & PCRE_EXTRA) != 0)
4030              *errorcodeptr = ERR7;                {
4031              goto FAILED;                *errorcodeptr = ERR7;
4032                  goto FAILED;
4033                  }
4034                class_charcount -= 2;  /* Undo the default count from above */
4035                c = *ptr;              /* Get the final character and fall through */
4036                break;
4037              }              }
   
           class_charcount -= 2;  /* Undo the default count from above */  
           c = *ptr;              /* Get the final character and fall through */  
4038            }            }
4039    
4040          /* Fall through if we have a single character (c >= 0). This may be          /* Fall through if we have a single character (c >= 0). This may be
# Line 2872  for (;; ptr++) Line 4048  for (;; ptr++)
4048        entirely. The code for handling \Q and \E is messy. */        entirely. The code for handling \Q and \E is messy. */
4049    
4050        CHECK_RANGE:        CHECK_RANGE:
4051        while (ptr[1] == '\\' && ptr[2] == 'E')        while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
4052          {          {
4053          inescq = FALSE;          inescq = FALSE;
4054          ptr += 2;          ptr += 2;
# Line 2880  for (;; ptr++) Line 4056  for (;; ptr++)
4056    
4057        oldptr = ptr;        oldptr = ptr;
4058    
4059        if (!inescq && ptr[1] == '-')        /* Remember \r or \n */
4060    
4061          if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
4062    
4063          /* Check for range */
4064    
4065          if (!inescq && ptr[1] == CHAR_MINUS)
4066          {          {
4067          int d;          int d;
4068          ptr += 2;          ptr += 2;
4069          while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;          while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
4070    
4071          /* If we hit \Q (not followed by \E) at this point, go into escaped          /* If we hit \Q (not followed by \E) at this point, go into escaped
4072          mode. */          mode. */
4073    
4074          while (*ptr == '\\' && ptr[1] == 'Q')          while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
4075            {            {
4076            ptr += 2;            ptr += 2;
4077            if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }            if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
4078                { ptr += 2; continue; }
4079            inescq = TRUE;            inescq = TRUE;
4080            break;            break;
4081            }            }
4082    
4083          if (*ptr == 0 || (!inescq && *ptr == ']'))          if (*ptr == 0 || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
4084            {            {
4085            ptr = oldptr;            ptr = oldptr;
4086            goto LONE_SINGLE_CHARACTER;            goto LONE_SINGLE_CHARACTER;
# Line 2916  for (;; ptr++) Line 4099  for (;; ptr++)
4099          not any of the other escapes. Perl 5.6 treats a hyphen as a literal          not any of the other escapes. Perl 5.6 treats a hyphen as a literal
4100          in such circumstances. */          in such circumstances. */
4101    
4102          if (!inescq && d == '\\')          if (!inescq && d == CHAR_BACKSLASH)
4103            {            {
4104            d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);            d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
4105            if (*errorcodeptr != 0) goto FAILED;            if (*errorcodeptr != 0) goto FAILED;
4106    
4107            /* \b is backslash; \X is literal X; \R is literal R; any other            /* \b is backspace; any other special means the '-' was literal */
           special means the '-' was literal */  
4108    
4109            if (d < 0)            if (d < 0)
4110              {              {
4111              if (d == -ESC_b) d = '\b';              if (d == -ESC_b) d = CHAR_BS; else
             else if (d == -ESC_X) d = 'X';  
             else if (d == -ESC_R) d = 'R'; else  
4112                {                {
4113                ptr = oldptr;                ptr = oldptr;
4114                goto LONE_SINGLE_CHARACTER;  /* A few lines below */                goto LONE_SINGLE_CHARACTER;  /* A few lines below */
# Line 2947  for (;; ptr++) Line 4127  for (;; ptr++)
4127    
4128          if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */          if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */
4129    
4130            /* Remember \r or \n */
4131    
4132            if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
4133    
4134          /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless          /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
4135          matching, we have to use an XCLASS with extra data items. Caseless          matching, we have to use an XCLASS with extra data items. Caseless
4136          matching for characters > 127 is available only if UCP support is          matching for characters > 127 is available only if UCP support is
# Line 3065  for (;; ptr++) Line 4249  for (;; ptr++)
4249          if ((options & PCRE_CASELESS) != 0)          if ((options & PCRE_CASELESS) != 0)
4250            {            {
4251            unsigned int othercase;            unsigned int othercase;
4252            if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)            if ((othercase = UCD_OTHERCASE(c)) != c)
4253              {              {
4254              *class_utf8data++ = XCL_SINGLE;              *class_utf8data++ = XCL_SINGLE;
4255              class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);              class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
# Line 3090  for (;; ptr++) Line 4274  for (;; ptr++)
4274          }          }
4275        }        }
4276    
4277      /* Loop until ']' reached. This "while" is the end of the "do" above. */      /* Loop until ']' reached. This "while" is the end of the "do" far above.
4278        If we are at the end of an internal nested string, revert to the outer
4279        string. */
4280    
4281        while (((c = *(++ptr)) != 0 ||
4282               (nestptr != NULL &&
4283                 (ptr = nestptr, nestptr = NULL, c = *(++ptr)) != 0)) &&
4284               (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
4285    
4286      while ((c = *(++ptr)) != 0 && (c != ']' || inescq));      /* Check for missing terminating ']' */
4287    
4288      if (c == 0)                          /* Missing terminating ']' */      if (c == 0)
4289        {        {
4290        *errorcodeptr = ERR6;        *errorcodeptr = ERR6;
4291        goto FAILED;        goto FAILED;
4292        }        }
4293    
4294      /* If class_charcount is 1, we saw precisely one character whose value is      /* If class_charcount is 1, we saw precisely one character whose value is
4295      less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we      less than 256. As long as there were no characters >= 128 and there was no
4296      can optimize the negative case only if there were no characters >= 128      use of \p or \P, in other words, no use of any XCLASS features, we can
4297      because OP_NOT and the related opcodes like OP_NOTSTAR operate on      optimize.
4298      single-bytes only. This is an historical hangover. Maybe one day we can  
4299      tidy these opcodes to handle multi-byte characters.      In UTF-8 mode, we can optimize the negative case only if there were no
4300        characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
4301        operate on single-bytes characters only. This is an historical hangover.
4302        Maybe one day we can tidy these opcodes to handle multi-byte characters.
4303    
4304      The optimization throws away the bit map. We turn the item into a      The optimization throws away the bit map. We turn the item into a
4305      1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note      1-character OP_CHAR[I] if it's positive, or OP_NOT[I] if it's negative.
4306      that OP_NOT does not support multibyte characters. In the positive case, it      Note that OP_NOT[I] does not support multibyte characters. In the positive
4307      can cause firstbyte to be set. Otherwise, there can be no first char if      case, it can cause firstbyte to be set. Otherwise, there can be no first
4308      this item is first, whatever repeat count may follow. In the case of      char if this item is first, whatever repeat count may follow. In the case
4309      reqbyte, save the previous value for reinstating. */      of reqbyte, save the previous value for reinstating. */
4310    
4311  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
4312      if (class_charcount == 1 &&      if (class_charcount == 1 && !class_utf8 &&
4313            (!utf8 ||        (!utf8 || !negate_class || class_lastchar < 128))
           (!class_utf8 && (!negate_class || class_lastchar < 128))))  
   
4314  #else  #else
4315      if (class_charcount == 1)      if (class_charcount == 1)
4316  #endif  #endif
4317        {        {
4318        zeroreqbyte = reqbyte;        zeroreqbyte = reqbyte;
4319    
4320        /* The OP_NOT opcode works on one-byte characters only. */        /* The OP_NOT[I] opcodes work on one-byte characters only. */
4321    
4322        if (negate_class)        if (negate_class)
4323          {          {
4324          if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;          if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4325          zerofirstbyte = firstbyte;          zerofirstbyte = firstbyte;
4326          *code++ = OP_NOT;          *code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT;
4327          *code++ = class_lastchar;          *code++ = class_lastchar;
4328          break;          break;
4329          }          }
# Line 3161  for (;; ptr++) Line 4353  for (;; ptr++)
4353      zeroreqbyte = reqbyte;      zeroreqbyte = reqbyte;
4354    
4355      /* If there are characters with values > 255, we have to compile an      /* If there are characters with values > 255, we have to compile an
4356      extended class, with its own opcode. If there are no characters < 256,      extended class, with its own opcode, unless there was a negated special
4357      we can omit the bitmap in the actual compiled code. */      such as \S in the class, and PCRE_UCP is not set, because in that case all
4358        characters > 255 are in the class, so any that were explicitly given as
4359        well can be ignored. If (when there are explicit characters > 255 that must
4360        be listed) there are no characters < 256, we can omit the bitmap in the
4361        actual compiled code. */
4362    
4363  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
4364      if (class_utf8)      if (class_utf8 && (!should_flip_negation || (options & PCRE_UCP) != 0))
4365        {        {
4366        *class_utf8data++ = XCL_END;    /* Marks the end of extra data */        *class_utf8data++ = XCL_END;    /* Marks the end of extra data */
4367        *code++ = OP_XCLASS;        *code++ = OP_XCLASS;
# Line 3186  for (;; ptr++) Line 4382  for (;; ptr++)
4382    
4383        /* Now fill in the complete length of the item */        /* Now fill in the complete length of the item */
4384    
4385        PUT(previous, 1, code - previous);        PUT(previous, 1, (int)(code - previous));
4386        break;   /* End of class handling */        break;   /* End of class handling */
4387        }        }
4388  #endif  #endif
4389    
4390      /* If there are no characters > 255, negate the 32-byte map if necessary,      /* If there are no characters > 255, or they are all to be included or
4391      and copy it into the code vector. If this is the first thing in the branch,      excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
4392      there can be no first char setting, whatever the repeat count. Any reqbyte      whole class was negated and whether there were negative specials such as \S
4393      setting must remain unchanged after any kind of repeat. */      (non-UCP) in the class. Then copy the 32-byte map into the code vector,
4394        negating it if necessary. */
4395    
4396        *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
4397      if (negate_class)      if (negate_class)
4398        {        {
       *code++ = OP_NCLASS;  
4399        if (lengthptr == NULL)    /* Save time in the pre-compile phase */        if (lengthptr == NULL)    /* Save time in the pre-compile phase */
4400          for (c = 0; c < 32; c++) code[c] = ~classbits[c];          for (c = 0; c < 32; c++) code[c] = ~classbits[c];