/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 221 by ph10, Fri Aug 17 09:25:08 2007 UTC revision 408 by ph10, Fri Mar 27 17:52:03 2009 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2007 University of Cambridge             Copyright (c) 1997-2009 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 43  supporting internal functions that are n Line 43  supporting internal functions that are n
43    
44    
45  #ifdef HAVE_CONFIG_H  #ifdef HAVE_CONFIG_H
46  #include <config.h>  #include "config.h"
47  #endif  #endif
48    
49  #define NLBLOCK cd             /* Block containing newline information */  #define NLBLOCK cd             /* Block containing newline information */
# Line 97  are simple data values; negative values Line 97  are simple data values; negative values
97  on. Zero means further processing is needed (for things like \x), or the escape  on. Zero means further processing is needed (for things like \x), or the escape
98  is invalid. */  is invalid. */
99    
100  #ifndef EBCDIC  /* This is the "normal" table for ASCII systems */  #ifndef EBCDIC
101    
102    /* This is the "normal" table for ASCII systems or for EBCDIC systems running
103    in UTF-8 mode. */
104    
105  static const short int escapes[] = {  static const short int escapes[] = {
106       0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */       0,                       0,
107       0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */       0,                       0,
108     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */       0,                       0,
109  -ESC_H,      0,      0, -ESC_K,      0,      0,      0,      0,   /* H - O */       0,                       0,
110  -ESC_P, -ESC_Q, -ESC_R, -ESC_S,      0,      0, -ESC_V, -ESC_W,   /* P - W */       0,                       0,
111  -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */       CHAR_COLON,              CHAR_SEMICOLON,
112     '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */       CHAR_LESS_THAN_SIGN,     CHAR_EQUALS_SIGN,
113  -ESC_h,      0,      0, -ESC_k,      0,      0,  ESC_n,      0,   /* h - o */       CHAR_GREATER_THAN_SIGN,  CHAR_QUESTION_MARK,
114  -ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0, -ESC_v, -ESC_w,   /* p - w */       CHAR_COMMERCIAL_AT,      -ESC_A,
115       0,      0, -ESC_z                                            /* x - z */       -ESC_B,                  -ESC_C,
116         -ESC_D,                  -ESC_E,
117         0,                       -ESC_G,
118         -ESC_H,                  0,
119         0,                       -ESC_K,
120         0,                       0,
121         0,                       0,
122         -ESC_P,                  -ESC_Q,
123         -ESC_R,                  -ESC_S,
124         0,                       0,
125         -ESC_V,                  -ESC_W,
126         -ESC_X,                  0,
127         -ESC_Z,                  CHAR_LEFT_SQUARE_BRACKET,
128         CHAR_BACKSLASH,          CHAR_RIGHT_SQUARE_BRACKET,
129         CHAR_CIRCUMFLEX_ACCENT,  CHAR_UNDERSCORE,
130         CHAR_GRAVE_ACCENT,       7,
131         -ESC_b,                  0,
132         -ESC_d,                  ESC_e,
133         ESC_f,                   0,
134         -ESC_h,                  0,
135         0,                       -ESC_k,
136         0,                       0,
137         ESC_n,                   0,
138         -ESC_p,                  0,
139         ESC_r,                   -ESC_s,
140         ESC_tee,                 0,
141         -ESC_v,                  -ESC_w,
142         0,                       0,
143         -ESC_z
144  };  };
145    
146  #else           /* This is the "abnormal" table for EBCDIC systems */  #else
147    
148    /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
149    
150  static const short int escapes[] = {  static const short int escapes[] = {
151  /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',  /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
152  /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,  /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
# Line 140  static const short int escapes[] = { Line 175  static const short int escapes[] = {
175  #endif  #endif
176    
177    
178  /* Table of special "verbs" like (*PRUNE) */  /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
179    searched linearly. Put all the names into a single string, in order to reduce
180    the number of relocations when a shared library is dynamically linked. The
181    string is built from string macros so that it works in UTF-8 mode on EBCDIC
182    platforms. */
183    
184  typedef struct verbitem {  typedef struct verbitem {
   const char *name;  
185    int   len;    int   len;
186    int   op;    int   op;
187  } verbitem;  } verbitem;
188    
189  static verbitem verbs[] = {  static const char verbnames[] =
190    { "ACCEPT", 6, OP_ACCEPT },    STRING_ACCEPT0
191    { "COMMIT", 6, OP_COMMIT },    STRING_COMMIT0
192    { "F",      1, OP_FAIL },    STRING_F0
193    { "FAIL",   4, OP_FAIL },    STRING_FAIL0
194    { "PRUNE",  5, OP_PRUNE },    STRING_PRUNE0
195    { "SKIP",   4, OP_SKIP  },    STRING_SKIP0
196    { "THEN",   4, OP_THEN  }    STRING_THEN;
197    
198    static const verbitem verbs[] = {
199      { 6, OP_ACCEPT },
200      { 6, OP_COMMIT },
201      { 1, OP_FAIL },
202      { 4, OP_FAIL },
203      { 5, OP_PRUNE },
204      { 4, OP_SKIP  },
205      { 4, OP_THEN  }
206  };  };
207    
208  static int verbcount = sizeof(verbs)/sizeof(verbitem);  static const int verbcount = sizeof(verbs)/sizeof(verbitem);
   
209    
 /* Tables of names of POSIX character classes and their lengths. The list is  
 terminated by a zero length entry. The first three must be alpha, lower, upper,  
 as this is assumed for handling case independence. */  
210    
211  static const char *const posix_names[] = {  /* Tables of names of POSIX character classes and their lengths. The names are
212    "alpha", "lower", "upper",  now all in a single string, to reduce the number of relocations when a shared
213    "alnum", "ascii", "blank", "cntrl", "digit", "graph",  library is dynamically loaded. The list of lengths is terminated by a zero
214    "print", "punct", "space", "word",  "xdigit" };  length entry. The first three must be alpha, lower, upper, as this is assumed
215    for handling case independence. */
216    
217    static const char posix_names[] =
218      STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
219      STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
220      STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
221      STRING_word0  STRING_xdigit;
222    
223  static const uschar posix_name_lengths[] = {  static const uschar posix_name_lengths[] = {
224    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
# Line 207  static const int posix_class_maps[] = { Line 257  static const int posix_class_maps[] = {
257  /* The texts of compile-time error messages. These are "char *" because they  /* The texts of compile-time error messages. These are "char *" because they
258  are passed to the outside world. Do not ever re-use any error number, because  are passed to the outside world. Do not ever re-use any error number, because
259  they are documented. Always add a new error instead. Messages marked DEAD below  they are documented. Always add a new error instead. Messages marked DEAD below
260  are no longer used. */  are no longer used. This used to be a table of strings, but in order to reduce
261    the number of relocations needed when a shared library is loaded dynamically,
262  static const char *error_texts[] = {  it is now one long string. We cannot use a table of offsets, because the
263    "no error",  lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
264    "\\ at end of pattern",  simply count through to the one we want - this isn't a performance issue
265    "\\c at end of pattern",  because these strings are used only when there is a compilation error. */
266    "unrecognized character follows \\",  
267    "numbers out of order in {} quantifier",  static const char error_texts[] =
268      "no error\0"
269      "\\ at end of pattern\0"
270      "\\c at end of pattern\0"
271      "unrecognized character follows \\\0"
272      "numbers out of order in {} quantifier\0"
273    /* 5 */    /* 5 */
274    "number too big in {} quantifier",    "number too big in {} quantifier\0"
275    "missing terminating ] for character class",    "missing terminating ] for character class\0"
276    "invalid escape sequence in character class",    "invalid escape sequence in character class\0"
277    "range out of order in character class",    "range out of order in character class\0"
278    "nothing to repeat",    "nothing to repeat\0"
279    /* 10 */    /* 10 */
280    "operand of unlimited repeat could match the empty string",  /** DEAD **/    "operand of unlimited repeat could match the empty string\0"  /** DEAD **/
281    "internal error: unexpected repeat",    "internal error: unexpected repeat\0"
282    "unrecognized character after (?",    "unrecognized character after (? or (?-\0"
283    "POSIX named classes are supported only within a class",    "POSIX named classes are supported only within a class\0"
284    "missing )",    "missing )\0"
285    /* 15 */    /* 15 */
286    "reference to non-existent subpattern",    "reference to non-existent subpattern\0"
287    "erroffset passed as NULL",    "erroffset passed as NULL\0"
288    "unknown option bit(s) set",    "unknown option bit(s) set\0"
289    "missing ) after comment",    "missing ) after comment\0"
290    "parentheses nested too deeply",  /** DEAD **/    "parentheses nested too deeply\0"  /** DEAD **/
291    /* 20 */    /* 20 */
292    "regular expression is too large",    "regular expression is too large\0"
293    "failed to get memory",    "failed to get memory\0"
294    "unmatched parentheses",    "unmatched parentheses\0"
295    "internal error: code overflow",    "internal error: code overflow\0"
296    "unrecognized character after (?<",    "unrecognized character after (?<\0"
297    /* 25 */    /* 25 */
298    "lookbehind assertion is not fixed length",    "lookbehind assertion is not fixed length\0"
299    "malformed number or name after (?(",    "malformed number or name after (?(\0"
300    "conditional group contains more than two branches",    "conditional group contains more than two branches\0"
301    "assertion expected after (?(",    "assertion expected after (?(\0"
302    "(?R or (?[+-]digits must be followed by )",    "(?R or (?[+-]digits must be followed by )\0"
303    /* 30 */    /* 30 */
304    "unknown POSIX class name",    "unknown POSIX class name\0"
305    "POSIX collating elements are not supported",    "POSIX collating elements are not supported\0"
306    "this version of PCRE is not compiled with PCRE_UTF8 support",    "this version of PCRE is not compiled with PCRE_UTF8 support\0"
307    "spare error",  /** DEAD **/    "spare error\0"  /** DEAD **/
308    "character value in \\x{...} sequence is too large",    "character value in \\x{...} sequence is too large\0"
309    /* 35 */    /* 35 */
310    "invalid condition (?(0)",    "invalid condition (?(0)\0"
311    "\\C not allowed in lookbehind assertion",    "\\C not allowed in lookbehind assertion\0"
312    "PCRE does not support \\L, \\l, \\N, \\U, or \\u",    "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"
313    "number after (?C is > 255",    "number after (?C is > 255\0"
314    "closing ) for (?C expected",    "closing ) for (?C expected\0"
315    /* 40 */    /* 40 */
316    "recursive call could loop indefinitely",    "recursive call could loop indefinitely\0"
317    "unrecognized character after (?P",    "unrecognized character after (?P\0"
318    "syntax error in subpattern name (missing terminator)",    "syntax error in subpattern name (missing terminator)\0"
319    "two named subpatterns have the same name",    "two named subpatterns have the same name\0"
320    "invalid UTF-8 string",    "invalid UTF-8 string\0"
321    /* 45 */    /* 45 */
322    "support for \\P, \\p, and \\X has not been compiled",    "support for \\P, \\p, and \\X has not been compiled\0"
323    "malformed \\P or \\p sequence",    "malformed \\P or \\p sequence\0"
324    "unknown property name after \\P or \\p",    "unknown property name after \\P or \\p\0"
325    "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)",    "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
326    "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")",    "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
327    /* 50 */    /* 50 */
328    "repeated subpattern is too long",    /** DEAD **/    "repeated subpattern is too long\0"    /** DEAD **/
329    "octal value is greater than \\377 (not in UTF-8 mode)",    "octal value is greater than \\377 (not in UTF-8 mode)\0"
330    "internal error: overran compiling workspace",    "internal error: overran compiling workspace\0"
331    "internal error: previously-checked referenced subpattern not found",    "internal error: previously-checked referenced subpattern not found\0"
332    "DEFINE group contains more than one branch",    "DEFINE group contains more than one branch\0"
333    /* 55 */    /* 55 */
334    "repeating a DEFINE group is not allowed",    "repeating a DEFINE group is not allowed\0"
335    "inconsistent NEWLINE options",    "inconsistent NEWLINE options\0"
336    "\\g is not followed by a braced name or an optionally braced non-zero number",    "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
337    "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number",    "a numbered reference must not be zero\0"
338    "(*VERB) with an argument is not supported",    "(*VERB) with an argument is not supported\0"
339    /* 60 */    /* 60 */
340    "(*VERB) not recognized",    "(*VERB) not recognized\0"
341    "number is too big"    "number is too big\0"
342  };    "subpattern name expected\0"
343      "digit expected after (?+\0"
344      "] is an invalid data character in JavaScript compatibility mode";
345    
346    
347  /* Table to identify digits and hex digits. This is used when compiling  /* Table to identify digits and hex digits. This is used when compiling
# Line 303  For convenience, we use the same bit def Line 360  For convenience, we use the same bit def
360    
361  Then we can use ctype_digit and ctype_xdigit in the code. */  Then we can use ctype_digit and ctype_xdigit in the code. */
362    
363  #ifndef EBCDIC  /* This is the "normal" case, for ASCII systems */  #ifndef EBCDIC
364    
365    /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
366    UTF-8 mode. */
367    
368  static const unsigned char digitab[] =  static const unsigned char digitab[] =
369    {    {
370    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
# Line 339  static const unsigned char digitab[] = Line 400  static const unsigned char digitab[] =
400    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
401    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
402    
403  #else           /* This is the "abnormal" case, for EBCDIC systems */  #else
404    
405    /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
406    
407  static const unsigned char digitab[] =  static const unsigned char digitab[] =
408    {    {
409    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
# Line 420  static BOOL Line 484  static BOOL
484    
485    
486  /*************************************************  /*************************************************
487    *            Find an error text                  *
488    *************************************************/
489    
490    /* The error texts are now all in one long string, to save on relocations. As
491    some of the text is of unknown length, we can't use a table of offsets.
492    Instead, just count through the strings. This is not a performance issue
493    because it happens only when there has been a compilation error.
494    
495    Argument:   the error number
496    Returns:    pointer to the error string
497    */
498    
499    static const char *
500    find_error_text(int n)
501    {
502    const char *s = error_texts;
503    for (; n > 0; n--) while (*s++ != 0) {};
504    return s;
505    }
506    
507    
508    /*************************************************
509  *            Handle escapes                      *  *            Handle escapes                      *
510  *************************************************/  *************************************************/
511    
# Line 458  ptr--; /* Set Line 544  ptr--; /* Set
544    
545  if (c == 0) *errorcodeptr = ERR1;  if (c == 0) *errorcodeptr = ERR1;
546    
547  /* Non-alphamerics are literals. For digits or letters, do an initial lookup in  /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
548  a table. A non-zero result is something that can be returned immediately.  in a table. A non-zero result is something that can be returned immediately.
549  Otherwise further processing may be required. */  Otherwise further processing may be required. */
550    
551  #ifndef EBCDIC  /* ASCII coding */  #ifndef EBCDIC  /* ASCII/UTF-8 coding */
552  else if (c < '0' || c > 'z') {}                           /* Not alphameric */  else if (c < CHAR_0 || c > CHAR_z) {}                     /* Not alphanumeric */
553  else if ((i = escapes[c - '0']) != 0) c = i;  else if ((i = escapes[c - CHAR_0]) != 0) c = i;
554    
555  #else           /* EBCDIC coding */  #else           /* EBCDIC coding */
556  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphameric */  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphanumeric */
557  else if ((i = escapes[c - 0x48]) != 0)  c = i;  else if ((i = escapes[c - 0x48]) != 0)  c = i;
558  #endif  #endif
559    
# Line 483  else Line 569  else
569      /* A number of Perl escapes are not handled by PCRE. We give an explicit      /* A number of Perl escapes are not handled by PCRE. We give an explicit
570      error. */      error. */
571    
572      case 'l':      case CHAR_l:
573      case 'L':      case CHAR_L:
574      case 'N':      case CHAR_N:
575      case 'u':      case CHAR_u:
576      case 'U':      case CHAR_U:
577      *errorcodeptr = ERR37;      *errorcodeptr = ERR37;
578      break;      break;
579    
580      /* \g must be followed by a number, either plain or braced. If positive, it      /* \g must be followed by one of a number of specific things:
581      is an absolute backreference. If negative, it is a relative backreference.  
582      This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a      (1) A number, either plain or braced. If positive, it is an absolute
583      reference to a named group. This is part of Perl's movement towards a      backreference. If negative, it is a relative backreference. This is a Perl
584      unified syntax for back references. As this is synonymous with \k{name}, we      5.10 feature.
585      fudge it up by pretending it really was \k. */  
586        (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
587        is part of Perl's movement towards a unified syntax for back references. As
588        this is synonymous with \k{name}, we fudge it up by pretending it really
589        was \k.
590    
591        (3) For Oniguruma compatibility we also support \g followed by a name or a
592        number either in angle brackets or in single quotes. However, these are
593        (possibly recursive) subroutine calls, _not_ backreferences. Just return
594        the -ESC_g code (cf \k). */
595    
596      case 'g':      case CHAR_g:
597      if (ptr[1] == '{')      if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
598          {
599          c = -ESC_g;
600          break;
601          }
602    
603        /* Handle the Perl-compatible cases */
604    
605        if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
606        {        {
607        const uschar *p;        const uschar *p;
608        for (p = ptr+2; *p != 0 && *p != '}'; p++)        for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
609          if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;          if (*p != CHAR_MINUS && (digitab[*p] & ctype_digit) == 0) break;
610        if (*p != 0 && *p != '}')        if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)
611          {          {
612          c = -ESC_k;          c = -ESC_k;
613          break;          break;
# Line 514  else Line 617  else
617        }        }
618      else braced = FALSE;      else braced = FALSE;
619    
620      if (ptr[1] == '-')      if (ptr[1] == CHAR_MINUS)
621        {        {
622        negated = TRUE;        negated = TRUE;
623        ptr++;        ptr++;
# Line 523  else Line 626  else
626    
627      c = 0;      c = 0;
628      while ((digitab[ptr[1]] & ctype_digit) != 0)      while ((digitab[ptr[1]] & ctype_digit) != 0)
629        c = c * 10 + *(++ptr) - '0';        c = c * 10 + *(++ptr) - CHAR_0;
630    
631      if (c < 0)      if (c < 0)   /* Integer overflow */
632        {        {
633        *errorcodeptr = ERR61;        *errorcodeptr = ERR61;
634        break;        break;
635        }        }
636    
637      if (c == 0 || (braced && *(++ptr) != '}'))      if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
638        {        {
639        *errorcodeptr = ERR57;        *errorcodeptr = ERR57;
640        break;        break;
641        }        }
642    
643        if (c == 0)
644          {
645          *errorcodeptr = ERR58;
646          break;
647          }
648    
649      if (negated)      if (negated)
650        {        {
651        if (c > bracount)        if (c > bracount)
# Line 562  else Line 671  else
671      value is greater than 377, the least significant 8 bits are taken. Inside a      value is greater than 377, the least significant 8 bits are taken. Inside a
672      character class, \ followed by a digit is always an octal number. */      character class, \ followed by a digit is always an octal number. */
673    
674      case '1': case '2': case '3': case '4': case '5':      case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
675      case '6': case '7': case '8': case '9':      case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
676    
677      if (!isclass)      if (!isclass)
678        {        {
679        oldptr = ptr;        oldptr = ptr;
680        c -= '0';        c -= CHAR_0;
681        while ((digitab[ptr[1]] & ctype_digit) != 0)        while ((digitab[ptr[1]] & ctype_digit) != 0)
682          c = c * 10 + *(++ptr) - '0';          c = c * 10 + *(++ptr) - CHAR_0;
683        if (c < 0)        if (c < 0)    /* Integer overflow */
684          {          {
685          *errorcodeptr = ERR61;          *errorcodeptr = ERR61;
686          break;          break;
# Line 588  else Line 697  else
697      generates a binary zero byte and treats the digit as a following literal.      generates a binary zero byte and treats the digit as a following literal.
698      Thus we have to pull back the pointer by one. */      Thus we have to pull back the pointer by one. */
699    
700      if ((c = *ptr) >= '8')      if ((c = *ptr) >= CHAR_8)
701        {        {
702        ptr--;        ptr--;
703        c = 0;        c = 0;
# Line 601  else Line 710  else
710      to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more      to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
711      than 3 octal digits. */      than 3 octal digits. */
712    
713      case '0':      case CHAR_0:
714      c -= '0';      c -= CHAR_0;
715      while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')      while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
716          c = c * 8 + *(++ptr) - '0';          c = c * 8 + *(++ptr) - CHAR_0;
717      if (!utf8 && c > 255) *errorcodeptr = ERR51;      if (!utf8 && c > 255) *errorcodeptr = ERR51;
718      break;      break;
719    
# Line 612  else Line 721  else
721      than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is      than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
722      treated as a data character. */      treated as a data character. */
723    
724      case 'x':      case CHAR_x:
725      if (ptr[1] == '{')      if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
726        {        {
727        const uschar *pt = ptr + 2;        const uschar *pt = ptr + 2;
728        int count = 0;        int count = 0;
# Line 622  else Line 731  else
731        while ((digitab[*pt] & ctype_xdigit) != 0)        while ((digitab[*pt] & ctype_xdigit) != 0)
732          {          {
733          register int cc = *pt++;          register int cc = *pt++;
734          if (c == 0 && cc == '0') continue;     /* Leading zeroes */          if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
735          count++;          count++;
736    
737  #ifndef EBCDIC  /* ASCII coding */  #ifndef EBCDIC  /* ASCII/UTF-8 coding */
738          if (cc >= 'a') cc -= 32;               /* Convert to upper case */          if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
739          c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));          c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
740  #else           /* EBCDIC coding */  #else           /* EBCDIC coding */
741          if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */          if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
742          c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));          c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
743  #endif  #endif
744          }          }
745    
746        if (*pt == '}')        if (*pt == CHAR_RIGHT_CURLY_BRACKET)
747          {          {
748          if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;          if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
749          ptr = pt;          ptr = pt;
# Line 650  else Line 759  else
759      c = 0;      c = 0;
760      while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)      while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
761        {        {
762        int cc;                               /* Some compilers don't like ++ */        int cc;                                  /* Some compilers don't like */
763        cc = *(++ptr);                        /* in initializers */        cc = *(++ptr);                           /* ++ in initializers */
764  #ifndef EBCDIC  /* ASCII coding */  #ifndef EBCDIC  /* ASCII/UTF-8 coding */
765        if (cc >= 'a') cc -= 32;              /* Convert to upper case */        if (cc >= CHAR_a) cc -= 32;              /* Convert to upper case */
766        c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));        c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
767  #else           /* EBCDIC coding */  #else           /* EBCDIC coding */
768        if (cc <= 'z') cc += 64;              /* Convert to upper case */        if (cc <= CHAR_z) cc += 64;              /* Convert to upper case */
769        c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));        c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
770  #endif  #endif
771        }        }
772      break;      break;
# Line 666  else Line 775  else
775      This coding is ASCII-specific, but then the whole concept of \cx is      This coding is ASCII-specific, but then the whole concept of \cx is
776      ASCII-specific. (However, an EBCDIC equivalent has now been added.) */      ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
777    
778      case 'c':      case CHAR_c:
779      c = *(++ptr);      c = *(++ptr);
780      if (c == 0)      if (c == 0)
781        {        {
# Line 674  else Line 783  else
783        break;        break;
784        }        }
785    
786  #ifndef EBCDIC  /* ASCII coding */  #ifndef EBCDIC  /* ASCII/UTF-8 coding */
787      if (c >= 'a' && c <= 'z') c -= 32;      if (c >= CHAR_a && c <= CHAR_z) c -= 32;
788      c ^= 0x40;      c ^= 0x40;
789  #else           /* EBCDIC coding */  #else           /* EBCDIC coding */
790      if (c >= 'a' && c <= 'z') c += 64;      if (c >= CHAR_a && c <= CHAR_z) c += 64;
791      c ^= 0xC0;      c ^= 0xC0;
792  #endif  #endif
793      break;      break;
794    
795      /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any      /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
796      other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,      other alphanumeric following \ is an error if PCRE_EXTRA was set;
797      for Perl compatibility, it is a literal. This code looks a bit odd, but      otherwise, for Perl compatibility, it is a literal. This code looks a bit
798      there used to be some cases other than the default, and there may be again      odd, but there used to be some cases other than the default, and there may
799      in future, so I haven't "optimized" it. */      be again in future, so I haven't "optimized" it. */
800    
801      default:      default:
802      if ((options & PCRE_EXTRA) != 0) switch(c)      if ((options & PCRE_EXTRA) != 0) switch(c)
# Line 740  if (c == 0) goto ERROR_RETURN; Line 849  if (c == 0) goto ERROR_RETURN;
849  /* \P or \p can be followed by a name in {}, optionally preceded by ^ for  /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
850  negation. */  negation. */
851    
852  if (c == '{')  if (c == CHAR_LEFT_CURLY_BRACKET)
853    {    {
854    if (ptr[1] == '^')    if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
855      {      {
856      *negptr = TRUE;      *negptr = TRUE;
857      ptr++;      ptr++;
# Line 751  if (c == '{') Line 860  if (c == '{')
860      {      {
861      c = *(++ptr);      c = *(++ptr);
862      if (c == 0) goto ERROR_RETURN;      if (c == 0) goto ERROR_RETURN;
863      if (c == '}') break;      if (c == CHAR_RIGHT_CURLY_BRACKET) break;
864      name[i] = c;      name[i] = c;
865      }      }
866    if (c !='}') goto ERROR_RETURN;    if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
867    name[i] = 0;    name[i] = 0;
868    }    }
869    
# Line 776  top = _pcre_utt_size; Line 885  top = _pcre_utt_size;
885  while (bot < top)  while (bot < top)
886    {    {
887    i = (bot + top) >> 1;    i = (bot + top) >> 1;
888    c = strcmp(name, _pcre_utt[i].name);    c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);
889    if (c == 0)    if (c == 0)
890      {      {
891      *dptr = _pcre_utt[i].value;      *dptr = _pcre_utt[i].value;
# Line 819  is_counted_repeat(const uschar *p) Line 928  is_counted_repeat(const uschar *p)
928  {  {
929  if ((digitab[*p++] & ctype_digit) == 0) return FALSE;  if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
930  while ((digitab[*p] & ctype_digit) != 0) p++;  while ((digitab[*p] & ctype_digit) != 0) p++;
931  if (*p == '}') return TRUE;  if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
932    
933  if (*p++ != ',') return FALSE;  if (*p++ != CHAR_COMMA) return FALSE;
934  if (*p == '}') return TRUE;  if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
935    
936  if ((digitab[*p++] & ctype_digit) == 0) return FALSE;  if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
937  while ((digitab[*p] & ctype_digit) != 0) p++;  while ((digitab[*p] & ctype_digit) != 0) p++;
938    
939  return (*p == '}');  return (*p == CHAR_RIGHT_CURLY_BRACKET);
940  }  }
941    
942    
# Line 860  int max = -1; Line 969  int max = -1;
969  /* Read the minimum value and do a paranoid check: a negative value indicates  /* Read the minimum value and do a paranoid check: a negative value indicates
970  an integer overflow. */  an integer overflow. */
971    
972  while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';  while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - CHAR_0;
973  if (min < 0 || min > 65535)  if (min < 0 || min > 65535)
974    {    {
975    *errorcodeptr = ERR5;    *errorcodeptr = ERR5;
# Line 870  if (min < 0 || min > 65535) Line 979  if (min < 0 || min > 65535)
979  /* Read the maximum value if there is one, and again do a paranoid check on its size.  /* Read the maximum value if there is one, and again do a paranoid check on its size.
980  Also, max must not be less than min. */  Also, max must not be less than min. */
981    
982  if (*p == '}') max = min; else  if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
983    {    {
984    if (*(++p) != '}')    if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
985      {      {
986      max = 0;      max = 0;
987      while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';      while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - CHAR_0;
988      if (max < 0 || max > 65535)      if (max < 0 || max > 65535)
989        {        {
990        *errorcodeptr = ERR5;        *errorcodeptr = ERR5;
# Line 900  return p; Line 1009  return p;
1009    
1010    
1011  /*************************************************  /*************************************************
1012  *       Find forward referenced subpattern       *  *  Subroutine for finding forward reference      *
1013  *************************************************/  *************************************************/
1014    
1015  /* This function scans along a pattern's text looking for capturing  /* This recursive function is called only from find_parens() below. The
1016    top-level call starts at the beginning of the pattern. All other calls must
1017    start at a parenthesis. It scans along a pattern's text looking for capturing
1018  subpatterns, and counting them. If it finds a named pattern that matches the  subpatterns, and counting them. If it finds a named pattern that matches the
1019  name it is given, it returns its number. Alternatively, if the name is NULL, it  name it is given, it returns its number. Alternatively, if the name is NULL, it
1020  returns when it reaches a given numbered subpattern. This is used for forward  returns when it reaches a given numbered subpattern. We know that if (?P< is
1021  references to subpatterns. We know that if (?P< is encountered, the name will  encountered, the name will be terminated by '>' because that is checked in the
1022  be terminated by '>' because that is checked in the first pass.  first pass. Recursion is used to keep track of subpatterns that reset the
1023    capturing group numbers - the (?| feature.
1024    
1025  Arguments:  Arguments:
1026    ptr          current position in the pattern    ptrptr       address of the current character pointer (updated)
1027    count        current count of capturing parens so far encountered    cd           compile background data
1028    name         name to seek, or NULL if seeking a numbered subpattern    name         name to seek, or NULL if seeking a numbered subpattern
1029    lorn         name length, or subpattern number if name is NULL    lorn         name length, or subpattern number if name is NULL
1030    xmode        TRUE if we are in /x mode    xmode        TRUE if we are in /x mode
1031      count        pointer to the current capturing subpattern number (updated)
1032    
1033  Returns:       the number of the named subpattern, or -1 if not found  Returns:       the number of the named subpattern, or -1 if not found
1034  */  */
1035    
1036  static int  static int
1037  find_parens(const uschar *ptr, int count, const uschar *name, int lorn,  find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn,
1038    BOOL xmode)    BOOL xmode, int *count)
1039  {  {
1040  const uschar *thisname;  uschar *ptr = *ptrptr;
1041    int start_count = *count;
1042    int hwm_count = start_count;
1043    BOOL dup_parens = FALSE;
1044    
1045  for (; *ptr != 0; ptr++)  /* If the first character is a parenthesis, check on the type of group we are
1046    dealing with. The very first call may not start with a parenthesis. */
1047    
1048    if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1049    {    {
1050    int term;    if (ptr[1] == CHAR_QUESTION_MARK &&
1051          ptr[2] == CHAR_VERTICAL_LINE)
1052        {
1053        ptr += 3;
1054        dup_parens = TRUE;
1055        }
1056    
1057      /* Handle a normal, unnamed capturing parenthesis */
1058    
1059      else if (ptr[1] != CHAR_QUESTION_MARK && ptr[1] != CHAR_ASTERISK)
1060        {
1061        *count += 1;
1062        if (name == NULL && *count == lorn) return *count;
1063        ptr++;
1064        }
1065    
1066      /* Handle a condition. If it is an assertion, just carry on so that it
1067      is processed as normal. If not, skip to the closing parenthesis of the
1068      condition (there can't be any nested parens). */
1069    
1070      else if (ptr[2] == CHAR_LEFT_PARENTHESIS)
1071        {
1072        ptr += 2;
1073        if (ptr[1] != CHAR_QUESTION_MARK)
1074          {
1075          while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
1076          if (*ptr != 0) ptr++;
1077          }
1078        }
1079    
1080      /* We have either (? or (* and not a condition */
1081    
1082      else
1083        {
1084        ptr += 2;
1085        if (*ptr == CHAR_P) ptr++;                      /* Allow optional P */
1086    
1087        /* We have to disambiguate (?<! and (?<= from (?<name> for named groups */
1088    
1089        if ((*ptr == CHAR_LESS_THAN_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK &&
1090            ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)
1091          {
1092          int term;
1093          const uschar *thisname;
1094          *count += 1;
1095          if (name == NULL && *count == lorn) return *count;
1096          term = *ptr++;
1097          if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;
1098          thisname = ptr;
1099          while (*ptr != term) ptr++;
1100          if (name != NULL && lorn == ptr - thisname &&
1101              strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1102            return *count;
1103          }
1104        }
1105      }
1106    
1107    /* Past any initial parenthesis handling, scan for parentheses or vertical
1108    bars. */
1109    
1110    for (; *ptr != 0; ptr++)
1111      {
1112    /* Skip over backslashed characters and also entire \Q...\E */    /* Skip over backslashed characters and also entire \Q...\E */
1113    
1114    if (*ptr == '\\')    if (*ptr == CHAR_BACKSLASH)
1115      {      {
1116      if (*(++ptr) == 0) return -1;      if (*(++ptr) == 0) goto FAIL_EXIT;
1117      if (*ptr == 'Q') for (;;)      if (*ptr == CHAR_Q) for (;;)
1118        {        {
1119        while (*(++ptr) != 0 && *ptr != '\\');        while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1120        if (*ptr == 0) return -1;        if (*ptr == 0) goto FAIL_EXIT;
1121        if (*(++ptr) == 'E') break;        if (*(++ptr) == CHAR_E) break;
1122        }        }
1123      continue;      continue;
1124      }      }
1125    
1126    /* Skip over character classes */    /* Skip over character classes; this logic must be similar to the way they
1127      are handled for real. If the first character is '^', skip it. Also, if the
1128      first few characters (either before or after ^) are \Q\E or \E we skip them
1129      too. This makes for compatibility with Perl. Note the use of STR macros to
1130      encode "Q\\E" so that it works in UTF-8 on EBCDIC platforms. */
1131    
1132    if (*ptr == '[')    if (*ptr == CHAR_LEFT_SQUARE_BRACKET)
1133      {      {
1134      while (*(++ptr) != ']')      BOOL negate_class = FALSE;
1135        for (;;)
1136          {
1137          int c = *(++ptr);
1138          if (c == CHAR_BACKSLASH)
1139            {
1140            if (ptr[1] == CHAR_E)
1141              ptr++;
1142            else if (strncmp((const char *)ptr+1,
1143                     STR_Q STR_BACKSLASH STR_E, 3) == 0)
1144              ptr += 3;
1145            else
1146              break;
1147            }
1148          else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
1149            negate_class = TRUE;
1150          else break;
1151          }
1152    
1153        /* If the next character is ']', it is a data character that must be
1154        skipped, except in JavaScript compatibility mode. */
1155    
1156        if (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET &&
1157            (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
1158          ptr++;
1159    
1160        while (*(++ptr) != CHAR_RIGHT_SQUARE_BRACKET)
1161        {        {
1162        if (*ptr == 0) return -1;        if (*ptr == 0) return -1;
1163        if (*ptr == '\\')        if (*ptr == CHAR_BACKSLASH)
1164          {          {
1165          if (*(++ptr) == 0) return -1;          if (*(++ptr) == 0) goto FAIL_EXIT;
1166          if (*ptr == 'Q') for (;;)          if (*ptr == CHAR_Q) for (;;)
1167            {            {
1168            while (*(++ptr) != 0 && *ptr != '\\');            while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1169            if (*ptr == 0) return -1;            if (*ptr == 0) goto FAIL_EXIT;
1170            if (*(++ptr) == 'E') break;            if (*(++ptr) == CHAR_E) break;
1171            }            }
1172          continue;          continue;
1173          }          }
# Line 968  for (; *ptr != 0; ptr++) Line 1177  for (; *ptr != 0; ptr++)
1177    
1178    /* Skip comments in /x mode */    /* Skip comments in /x mode */
1179    
1180    if (xmode && *ptr == '#')    if (xmode && *ptr == CHAR_NUMBER_SIGN)
1181      {      {
1182      while (*(++ptr) != 0 && *ptr != '\n');      while (*(++ptr) != 0 && *ptr != CHAR_NL) {};
1183      if (*ptr == 0) return -1;      if (*ptr == 0) goto FAIL_EXIT;
1184      continue;      continue;
1185      }      }
1186    
1187    /* An opening parens must now be a real metacharacter */    /* Check for the special metacharacters */
1188    
1189    if (*ptr != '(') continue;    if (*ptr == CHAR_LEFT_PARENTHESIS)
1190    if (ptr[1] != '?' && ptr[1] != '*')      {
1191        int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, count);
1192        if (rc > 0) return rc;
1193        if (*ptr == 0) goto FAIL_EXIT;
1194        }
1195    
1196      else if (*ptr == CHAR_RIGHT_PARENTHESIS)
1197      {      {
1198      count++;      if (dup_parens && *count < hwm_count) *count = hwm_count;
1199      if (name == NULL && count == lorn) return count;      *ptrptr = ptr;
1200      continue;      return -1;
1201      }      }
1202    
1203      else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)
1204        {
1205        if (*count > hwm_count) hwm_count = *count;
1206        *count = start_count;
1207        }
1208      }
1209    
1210    ptr += 2;  FAIL_EXIT:
1211    if (*ptr == 'P') ptr++;                      /* Allow optional P */  *ptrptr = ptr;
1212    return -1;
1213    }
1214    
   /* We have to disambiguate (?<! and (?<= from (?<name> */  
1215    
   if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&  
        *ptr != '\'')  
     continue;  
1216    
   count++;  
1217    
1218    if (name == NULL && count == lorn) return count;  /*************************************************
1219    term = *ptr++;  *       Find forward referenced subpattern       *
1220    if (term == '<') term = '>';  *************************************************/
   thisname = ptr;  
   while (*ptr != term) ptr++;  
   if (name != NULL && lorn == ptr - thisname &&  
       strncmp((const char *)name, (const char *)thisname, lorn) == 0)  
     return count;  
   }  
1221    
1222  return -1;  /* This function scans along a pattern's text looking for capturing
1223    subpatterns, and counting them. If it finds a named pattern that matches the
1224    name it is given, it returns its number. Alternatively, if the name is NULL, it
1225    returns when it reaches a given numbered subpattern. This is used for forward
1226    references to subpatterns. We used to be able to start this scan from the
1227    current compiling point, using the current count value from cd->bracount, and
1228    do it all in a single loop, but the addition of the possibility of duplicate
1229    subpattern numbers means that we have to scan from the very start, in order to
1230    take account of such duplicates, and to use a recursive function to keep track
1231    of the different types of group.
1232    
1233    Arguments:
1234      cd           compile background data
1235      name         name to seek, or NULL if seeking a numbered subpattern
1236      lorn         name length, or subpattern number if name is NULL
1237      xmode        TRUE if we are in /x mode
1238    
1239    Returns:       the number of the found subpattern, or -1 if not found
1240    */
1241    
1242    static int
1243    find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode)
1244    {
1245    uschar *ptr = (uschar *)cd->start_pattern;
1246    int count = 0;
1247    int rc;
1248    
1249    /* If the pattern does not start with an opening parenthesis, the first call
1250    to find_parens_sub() will scan right to the end (if necessary). However, if it
1251    does start with a parenthesis, find_parens_sub() will return when it hits the
1252    matching closing parens. That is why we have to have a loop. */
1253    
1254    for (;;)
1255      {
1256      rc = find_parens_sub(&ptr, cd, name, lorn, xmode, &count);
1257      if (rc > 0 || *ptr++ == 0) break;
1258      }
1259    
1260    return rc;
1261  }  }
1262    
1263    
1264    
1265    
1266  /*************************************************  /*************************************************
1267  *      Find first significant op code            *  *      Find first significant op code            *
1268  *************************************************/  *************************************************/
# Line 1212  for (;;) Line 1464  for (;;)
1464      case OP_NOT_WORDCHAR:      case OP_NOT_WORDCHAR:
1465      case OP_WORDCHAR:      case OP_WORDCHAR:
1466      case OP_ANY:      case OP_ANY:
1467        case OP_ALLANY:
1468      branchlength++;      branchlength++;
1469      cc++;      cc++;
1470      break;      break;
# Line 1360  for (;;) Line 1613  for (;;)
1613        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1614        break;        break;
1615        }        }
1616    #else
1617        (void)(utf8);  /* Keep compiler happy by referencing function argument */
1618  #endif  #endif
1619      }      }
1620    }    }
# Line 1453  for (;;) Line 1708  for (;;)
1708        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1709        break;        break;
1710        }        }
1711    #else
1712        (void)(utf8);  /* Keep compiler happy by referencing function argument */
1713  #endif  #endif
1714      }      }
1715    }    }
# Line 1468  for (;;) Line 1725  for (;;)
1725  can match the empty string or not. It is called from could_be_empty()  can match the empty string or not. It is called from could_be_empty()
1726  below and from compile_branch() when checking for an unlimited repeat of a  below and from compile_branch() when checking for an unlimited repeat of a
1727  group that can match nothing. Note that first_significant_code() skips over  group that can match nothing. Note that first_significant_code() skips over
1728  assertions. If we hit an unclosed bracket, we return "empty" - this means we've  backward and negative forward assertions when its final argument is TRUE. If we
1729  struck an inner bracket whose current branch will already have been scanned.  hit an unclosed bracket, we return "empty" - this means we've struck an inner
1730    bracket whose current branch will already have been scanned.
1731    
1732  Arguments:  Arguments:
1733    code        points to start of search    code        points to start of search
# Line 1491  for (code = first_significant_code(code Line 1749  for (code = first_significant_code(code
1749    
1750    c = *code;    c = *code;
1751    
1752      /* Skip over forward assertions; the other assertions are skipped by
1753      first_significant_code() with a TRUE final argument. */
1754    
1755      if (c == OP_ASSERT)
1756        {
1757        do code += GET(code, 1); while (*code == OP_ALT);
1758        c = *code;
1759        continue;
1760        }
1761    
1762    /* Groups with zero repeats can of course be empty; skip them. */    /* Groups with zero repeats can of course be empty; skip them. */
1763    
1764    if (c == OP_BRAZERO || c == OP_BRAMINZERO)    if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO)
1765      {      {
1766      code += _pcre_OP_lengths[c];      code += _pcre_OP_lengths[c];
1767      do code += GET(code, 1); while (*code == OP_ALT);      do code += GET(code, 1); while (*code == OP_ALT);
# Line 1508  for (code = first_significant_code(code Line 1776  for (code = first_significant_code(code
1776      BOOL empty_branch;      BOOL empty_branch;
1777      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
1778    
1779      /* Scan a closed bracket */      /* If a conditional group has only one branch, there is a second, implied,
1780        empty branch, so just skip over the conditional, because it could be empty.
1781        Otherwise, scan the individual branches of the group. */
1782    
1783      empty_branch = FALSE;      if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
     do  
       {  
       if (!empty_branch && could_be_empty_branch(code, endcode, utf8))  
         empty_branch = TRUE;  
1784        code += GET(code, 1);        code += GET(code, 1);
1785        else
1786          {
1787          empty_branch = FALSE;
1788          do
1789            {
1790            if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1791              empty_branch = TRUE;
1792            code += GET(code, 1);
1793            }
1794          while (*code == OP_ALT);
1795          if (!empty_branch) return FALSE;   /* All branches are non-empty */
1796        }        }
1797      while (*code == OP_ALT);  
     if (!empty_branch) return FALSE;   /* All branches are non-empty */  
1798      c = *code;      c = *code;
1799      continue;      continue;
1800      }      }
# Line 1579  for (code = first_significant_code(code Line 1855  for (code = first_significant_code(code
1855      case OP_NOT_WORDCHAR:      case OP_NOT_WORDCHAR:
1856      case OP_WORDCHAR:      case OP_WORDCHAR:
1857      case OP_ANY:      case OP_ANY:
1858        case OP_ALLANY:
1859      case OP_ANYBYTE:      case OP_ANYBYTE:
1860      case OP_CHAR:      case OP_CHAR:
1861      case OP_CHARNC:      case OP_CHARNC:
# Line 1597  for (code = first_significant_code(code Line 1874  for (code = first_significant_code(code
1874      case OP_TYPEEXACT:      case OP_TYPEEXACT:
1875      return FALSE;      return FALSE;
1876    
1877        /* These are going to continue, as they may be empty, but we have to
1878        fudge the length for the \p and \P cases. */
1879    
1880        case OP_TYPESTAR:
1881        case OP_TYPEMINSTAR:
1882        case OP_TYPEPOSSTAR:
1883        case OP_TYPEQUERY:
1884        case OP_TYPEMINQUERY:
1885        case OP_TYPEPOSQUERY:
1886        if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1887        break;
1888    
1889        /* Same for these */
1890    
1891        case OP_TYPEUPTO:
1892        case OP_TYPEMINUPTO:
1893        case OP_TYPEPOSUPTO:
1894        if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1895        break;
1896    
1897      /* End of branch */      /* End of branch */
1898    
1899      case OP_KET:      case OP_KET:
# Line 1666  return TRUE; Line 1963  return TRUE;
1963  *************************************************/  *************************************************/
1964    
1965  /* This function is called when the sequence "[:" or "[." or "[=" is  /* This function is called when the sequence "[:" or "[." or "[=" is
1966  encountered in a character class. It checks whether this is followed by an  encountered in a character class. It checks whether this is followed by a
1967  optional ^ and then a sequence of letters, terminated by a matching ":]" or  sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
1968  ".]" or "=]".  reach an unescaped ']' without the special preceding character, return FALSE.
1969    
1970    Originally, this function only recognized a sequence of letters between the
1971    terminators, but it seems that Perl recognizes any sequence of characters,
1972    though of course unknown POSIX names are subsequently rejected. Perl gives an
1973    "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
1974    didn't consider this to be a POSIX class. Likewise for [:1234:].
1975    
1976    The problem in trying to be exactly like Perl is in the handling of escapes. We
1977    have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
1978    class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
1979    below handles the special case of \], but does not try to do any other escape
1980    processing. This makes it different from Perl for cases such as [:l\ower:]
1981    where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
1982    "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
1983    I think.
1984    
1985  Argument:  Arguments:
1986    ptr      pointer to the initial [    ptr      pointer to the initial [
1987    endptr   where to return the end pointer    endptr   where to return the end pointer
   cd       pointer to compile data  
1988    
1989  Returns:   TRUE or FALSE  Returns:   TRUE or FALSE
1990  */  */
1991    
1992  static BOOL  static BOOL
1993  check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)  check_posix_syntax(const uschar *ptr, const uschar **endptr)
1994  {  {
1995  int terminator;          /* Don't combine these lines; the Solaris cc */  int terminator;          /* Don't combine these lines; the Solaris cc */
1996  terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */  terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
1997  if (*(++ptr) == '^') ptr++;  for (++ptr; *ptr != 0; ptr++)
 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;  
 if (*ptr == terminator && ptr[1] == ']')  
1998    {    {
1999    *endptr = ptr;    if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) ptr++; else
2000    return TRUE;      {
2001        if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2002        if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2003          {
2004          *endptr = ptr;
2005          return TRUE;
2006          }
2007        }
2008    }    }
2009  return FALSE;  return FALSE;
2010  }  }
# Line 1713  Returns: a value representing the na Line 2029  Returns: a value representing the na
2029  static int  static int
2030  check_posix_name(const uschar *ptr, int len)  check_posix_name(const uschar *ptr, int len)
2031  {  {
2032    const char *pn = posix_names;
2033  register int yield = 0;  register int yield = 0;
2034  while (posix_name_lengths[yield] != 0)  while (posix_name_lengths[yield] != 0)
2035    {    {
2036    if (len == posix_name_lengths[yield] &&    if (len == posix_name_lengths[yield] &&
2037      strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;      strncmp((const char *)ptr, pn, len) == 0) return yield;
2038      pn += posix_name_lengths[yield] + 1;
2039    yield++;    yield++;
2040    }    }
2041  return -1;  return -1;
# Line 1732  return -1; Line 2050  return -1;
2050  that is referenced. This means that groups can be replicated for fixed  that is referenced. This means that groups can be replicated for fixed
2051  repetition simply by copying (because the recursion is allowed to refer to  repetition simply by copying (because the recursion is allowed to refer to
2052  earlier groups that are outside the current group). However, when a group is  earlier groups that are outside the current group). However, when a group is
2053  optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before  optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
2054  it, after it has been compiled. This means that any OP_RECURSE items within it  inserted before it, after it has been compiled. This means that any OP_RECURSE
2055  that refer to the group itself or any contained groups have to have their  items within it that refer to the group itself or any contained groups have to
2056  offsets adjusted. That is one of the jobs of this function. Before it is called,  have their offsets adjusted. That is one of the jobs of this function. Before it
2057  the partially compiled regex must be temporarily terminated with OP_END.  is called, the partially compiled regex must be temporarily terminated with
2058    OP_END.
2059    
2060  This function has been extended with the possibility of forward references for  This function has been extended with the possibility of forward references for
2061  recursions and subroutine calls. It must also check the list of such references  recursions and subroutine calls. It must also check the list of such references
# Line 1759  adjust_recurse(uschar *group, int adjust Line 2078  adjust_recurse(uschar *group, int adjust
2078    uschar *save_hwm)    uschar *save_hwm)
2079  {  {
2080  uschar *ptr = group;  uschar *ptr = group;
2081    
2082  while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)  while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
2083    {    {
2084    int offset;    int offset;
# Line 1870  get_othercase_range(unsigned int *cptr, Line 2190  get_othercase_range(unsigned int *cptr,
2190  unsigned int c, othercase, next;  unsigned int c, othercase, next;
2191    
2192  for (c = *cptr; c <= d; c++)  for (c = *cptr; c <= d; c++)
2193    { if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) break; }    { if ((othercase = UCD_OTHERCASE(c)) != c) break; }
2194    
2195  if (c > d) return FALSE;  if (c > d) return FALSE;
2196    
# Line 1879  next = othercase + 1; Line 2199  next = othercase + 1;
2199    
2200  for (++c; c <= d; c++)  for (++c; c <= d; c++)
2201    {    {
2202    if (_pcre_ucp_othercase(c) != next) break;    if (UCD_OTHERCASE(c) != next) break;
2203    next++;    next++;
2204    }    }
2205    
# Line 1925  if ((options & PCRE_EXTENDED) != 0) Line 2245  if ((options & PCRE_EXTENDED) != 0)
2245    for (;;)    for (;;)
2246      {      {
2247      while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;      while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2248      if (*ptr == '#')      if (*ptr == CHAR_NUMBER_SIGN)
2249        {        {
2250        while (*(++ptr) != 0)        while (*(++ptr) != 0)
2251          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
# Line 1937  if ((options & PCRE_EXTENDED) != 0) Line 2257  if ((options & PCRE_EXTENDED) != 0)
2257  /* If the next item is one that we can handle, get its value. A non-negative  /* If the next item is one that we can handle, get its value. A non-negative
2258  value is a character, a negative value is an escape value. */  value is a character, a negative value is an escape value. */
2259    
2260  if (*ptr == '\\')  if (*ptr == CHAR_BACKSLASH)
2261    {    {
2262    int temperrorcode = 0;    int temperrorcode = 0;
2263    next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);    next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
# Line 1962  if ((options & PCRE_EXTENDED) != 0) Line 2282  if ((options & PCRE_EXTENDED) != 0)
2282    for (;;)    for (;;)
2283      {      {
2284      while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;      while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2285      if (*ptr == '#')      if (*ptr == CHAR_NUMBER_SIGN)
2286        {        {
2287        while (*(++ptr) != 0)        while (*(++ptr) != 0)
2288          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
# Line 1973  if ((options & PCRE_EXTENDED) != 0) Line 2293  if ((options & PCRE_EXTENDED) != 0)
2293    
2294  /* If the next thing is itself optional, we have to give up. */  /* If the next thing is itself optional, we have to give up. */
2295    
2296  if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)  if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
2297    return FALSE;    strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
2298        return FALSE;
2299    
2300  /* Now compare the next item with the previous opcode. If the previous is a  /* Now compare the next item with the previous opcode. If the previous is a
2301  positive single character match, "item" either contains the character or, if  positive single character match, "item" either contains the character or, if
# Line 1989  if (next >= 0) switch(op_code) Line 2310  if (next >= 0) switch(op_code)
2310    case OP_CHAR:    case OP_CHAR:
2311  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2312    if (utf8 && item > 127) { GETCHAR(item, utf8_char); }    if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2313    #else
2314      (void)(utf8_char);  /* Keep compiler happy by referencing function argument */
2315  #endif  #endif
2316    return item != next;    return item != next;
2317    
# Line 2007  if (next >= 0) switch(op_code) Line 2330  if (next >= 0) switch(op_code)
2330      unsigned int othercase;      unsigned int othercase;
2331      if (next < 128) othercase = cd->fcc[next]; else      if (next < 128) othercase = cd->fcc[next]; else
2332  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2333      othercase = _pcre_ucp_othercase((unsigned int)next);      othercase = UCD_OTHERCASE((unsigned int)next);
2334  #else  #else
2335      othercase = NOTACHAR;      othercase = NOTACHAR;
2336  #endif  #endif
# Line 2020  if (next >= 0) switch(op_code) Line 2343  if (next >= 0) switch(op_code)
2343    /* For OP_NOT, "item" must be a single-byte character. */    /* For OP_NOT, "item" must be a single-byte character. */
2344    
2345    case OP_NOT:    case OP_NOT:
   if (next < 0) return FALSE;  /* Not a character */  
2346    if (item == next) return TRUE;    if (item == next) return TRUE;
2347    if ((options & PCRE_CASELESS) == 0) return FALSE;    if ((options & PCRE_CASELESS) == 0) return FALSE;
2348  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
# Line 2029  if (next >= 0) switch(op_code) Line 2351  if (next >= 0) switch(op_code)
2351      unsigned int othercase;      unsigned int othercase;
2352      if (next < 128) othercase = cd->fcc[next]; else      if (next < 128) othercase = cd->fcc[next]; else
2353  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2354      othercase = _pcre_ucp_othercase(next);      othercase = UCD_OTHERCASE(next);
2355  #else  #else
2356      othercase = NOTACHAR;      othercase = NOTACHAR;
2357  #endif  #endif
# Line 2283  uschar classbits[32]; Line 2605  uschar classbits[32];
2605  BOOL class_utf8;  BOOL class_utf8;
2606  BOOL utf8 = (options & PCRE_UTF8) != 0;  BOOL utf8 = (options & PCRE_UTF8) != 0;
2607  uschar *class_utf8data;  uschar *class_utf8data;
2608    uschar *class_utf8data_base;
2609  uschar utf8_char[6];  uschar utf8_char[6];
2610  #else  #else
2611  BOOL utf8 = FALSE;  BOOL utf8 = FALSE;
# Line 2322  req_caseopt = ((options & PCRE_CASELESS) Line 2645  req_caseopt = ((options & PCRE_CASELESS)
2645  for (;; ptr++)  for (;; ptr++)
2646    {    {
2647    BOOL negate_class;    BOOL negate_class;
2648      BOOL should_flip_negation;
2649    BOOL possessive_quantifier;    BOOL possessive_quantifier;
2650    BOOL is_quantifier;    BOOL is_quantifier;
2651    BOOL is_recurse;    BOOL is_recurse;
# Line 2409  for (;; ptr++) Line 2733  for (;; ptr++)
2733    
2734    if (inescq && c != 0)    if (inescq && c != 0)
2735      {      {
2736      if (c == '\\' && ptr[1] == 'E')      if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
2737        {        {
2738        inescq = FALSE;        inescq = FALSE;
2739        ptr++;        ptr++;
# Line 2435  for (;; ptr++) Line 2759  for (;; ptr++)
2759    /* Fill in length of a previous callout, except when the next thing is    /* Fill in length of a previous callout, except when the next thing is
2760    a quantifier. */    a quantifier. */
2761    
2762    is_quantifier = c == '*' || c == '+' || c == '?' ||    is_quantifier =
2763      (c == '{' && is_counted_repeat(ptr+1));      c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
2764        (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
2765    
2766    if (!is_quantifier && previous_callout != NULL &&    if (!is_quantifier && previous_callout != NULL &&
2767         after_manual_callout-- <= 0)         after_manual_callout-- <= 0)
# Line 2451  for (;; ptr++) Line 2776  for (;; ptr++)
2776    if ((options & PCRE_EXTENDED) != 0)    if ((options & PCRE_EXTENDED) != 0)
2777      {      {
2778      if ((cd->ctypes[c] & ctype_space) != 0) continue;      if ((cd->ctypes[c] & ctype_space) != 0) continue;
2779      if (c == '#')      if (c == CHAR_NUMBER_SIGN)
2780        {        {
2781        while (*(++ptr) != 0)        while (*(++ptr) != 0)
2782          {          {
# Line 2476  for (;; ptr++) Line 2801  for (;; ptr++)
2801      {      {
2802      /* ===================================================================*/      /* ===================================================================*/
2803      case 0:                        /* The branch terminates at string end */      case 0:                        /* The branch terminates at string end */
2804      case '|':                      /* or | or ) */      case CHAR_VERTICAL_LINE:       /* or | or ) */
2805      case ')':      case CHAR_RIGHT_PARENTHESIS:
2806      *firstbyteptr = firstbyte;      *firstbyteptr = firstbyte;
2807      *reqbyteptr = reqbyte;      *reqbyteptr = reqbyte;
2808      *codeptr = code;      *codeptr = code;
# Line 2499  for (;; ptr++) Line 2824  for (;; ptr++)
2824      /* Handle single-character metacharacters. In multiline mode, ^ disables      /* Handle single-character metacharacters. In multiline mode, ^ disables
2825      the setting of any following char as a first character. */      the setting of any following char as a first character. */
2826    
2827      case '^':      case CHAR_CIRCUMFLEX_ACCENT:
2828      if ((options & PCRE_MULTILINE) != 0)      if ((options & PCRE_MULTILINE) != 0)
2829        {        {
2830        if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;        if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
# Line 2508  for (;; ptr++) Line 2833  for (;; ptr++)
2833      *code++ = OP_CIRC;      *code++ = OP_CIRC;
2834      break;      break;
2835    
2836      case '$':      case CHAR_DOLLAR_SIGN:
2837      previous = NULL;      previous = NULL;
2838      *code++ = OP_DOLL;      *code++ = OP_DOLL;
2839      break;      break;
# Line 2516  for (;; ptr++) Line 2841  for (;; ptr++)
2841      /* There can never be a first char if '.' is first, whatever happens about      /* There can never be a first char if '.' is first, whatever happens about
2842      repeats. The value of reqbyte doesn't change either. */      repeats. The value of reqbyte doesn't change either. */
2843    
2844      case '.':      case CHAR_DOT:
2845      if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;      if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2846      zerofirstbyte = firstbyte;      zerofirstbyte = firstbyte;
2847      zeroreqbyte = reqbyte;      zeroreqbyte = reqbyte;
2848      previous = code;      previous = code;
2849      *code++ = OP_ANY;      *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
2850      break;      break;
2851    
2852    
# Line 2536  for (;; ptr++) Line 2861  for (;; ptr++)
2861      opcode is compiled. It may optionally have a bit map for characters < 256,      opcode is compiled. It may optionally have a bit map for characters < 256,
2862      but those above are explicitly listed afterwards. A flag byte tells      but those above are explicitly listed afterwards. A flag byte tells
2863      whether the bitmap is present, and whether this is a negated class or not.      whether the bitmap is present, and whether this is a negated class or not.
     */  
2864    
2865      case '[':      In JavaScript compatibility mode, an isolated ']' causes an error. In
2866        default (Perl) mode, it is treated as a data character. */
2867    
2868        case CHAR_RIGHT_SQUARE_BRACKET:
2869        if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
2870          {
2871          *errorcodeptr = ERR64;
2872          goto FAILED;
2873          }
2874        goto NORMAL_CHAR;
2875    
2876        case CHAR_LEFT_SQUARE_BRACKET:
2877      previous = code;      previous = code;
2878    
2879      /* PCRE supports POSIX class stuff inside a class. Perl gives an error if      /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2880      they are encountered at the top level, so we'll do that too. */      they are encountered at the top level, so we'll do that too. */
2881    
2882      if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&      if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
2883          check_posix_syntax(ptr, &tempptr, cd))           ptr[1] == CHAR_EQUALS_SIGN) &&
2884            check_posix_syntax(ptr, &tempptr))
2885        {        {
2886        *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;        *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
2887        goto FAILED;        goto FAILED;
2888        }        }
2889    
# Line 2559  for (;; ptr++) Line 2895  for (;; ptr++)
2895      for (;;)      for (;;)
2896        {        {
2897        c = *(++ptr);        c = *(++ptr);
2898        if (c == '\\')        if (c == CHAR_BACKSLASH)
2899          {          {
2900          if (ptr[1] == 'E') ptr++;          if (ptr[1] == CHAR_E)
2901            else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;            ptr++;
2902              else break;          else if (strncmp((const char *)ptr+1,
2903                              STR_Q STR_BACKSLASH STR_E, 3) == 0)
2904              ptr += 3;
2905            else
2906              break;
2907          }          }
2908        else if (!negate_class && c == '^')        else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
2909          negate_class = TRUE;          negate_class = TRUE;
2910        else break;        else break;
2911        }        }
2912    
2913        /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
2914        an initial ']' is taken as a data character -- the code below handles
2915        that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
2916        [^] must match any character, so generate OP_ALLANY. */
2917    
2918        if (c == CHAR_RIGHT_SQUARE_BRACKET &&
2919            (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
2920          {
2921          *code++ = negate_class? OP_ALLANY : OP_FAIL;
2922          if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2923          zerofirstbyte = firstbyte;
2924          break;
2925          }
2926    
2927        /* If a class contains a negative special such as \S, we need to flip the
2928        negation flag at the end, so that support for characters > 255 works
2929        correctly (they are all included in the class). */
2930    
2931        should_flip_negation = FALSE;
2932    
2933      /* Keep a count of chars with values < 256 so that we can optimize the case      /* Keep a count of chars with values < 256 so that we can optimize the case
2934      of just a single character (as long as it's < 256). However, for higher      of just a single character (as long as it's < 256). However, for higher
2935      valued UTF-8 characters, we don't yet do any optimization. */      valued UTF-8 characters, we don't yet do any optimization. */
# Line 2587  for (;; ptr++) Line 2947  for (;; ptr++)
2947  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2948      class_utf8 = FALSE;                       /* No chars >= 256 */      class_utf8 = FALSE;                       /* No chars >= 256 */
2949      class_utf8data = code + LINK_SIZE + 2;    /* For UTF-8 items */      class_utf8data = code + LINK_SIZE + 2;    /* For UTF-8 items */
2950        class_utf8data_base = class_utf8data;     /* For resetting in pass 1 */
2951  #endif  #endif
2952    
2953      /* Process characters until ] is reached. By writing this as a "do" it      /* Process characters until ] is reached. By writing this as a "do" it
# Line 2602  for (;; ptr++) Line 2963  for (;; ptr++)
2963          {                           /* Braces are required because the */          {                           /* Braces are required because the */
2964          GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */          GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
2965          }          }
2966    
2967          /* In the pre-compile phase, accumulate the length of any UTF-8 extra
2968          data and reset the pointer. This is so that very large classes that
2969          contain a zillion UTF-8 characters no longer overwrite the work space
2970          (which is on the stack). */
2971    
2972          if (lengthptr != NULL)
2973            {
2974            *lengthptr += class_utf8data - class_utf8data_base;
2975            class_utf8data = class_utf8data_base;
2976            }
2977    
2978  #endif  #endif
2979    
2980        /* Inside \Q...\E everything is literal except \E */        /* Inside \Q...\E everything is literal except \E */
2981    
2982        if (inescq)        if (inescq)
2983          {          {
2984          if (c == '\\' && ptr[1] == 'E')     /* If we are at \E */          if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)  /* If we are at \E */
2985            {            {
2986            inescq = FALSE;                   /* Reset literal state */            inescq = FALSE;                   /* Reset literal state */
2987            ptr++;                            /* Skip the 'E' */            ptr++;                            /* Skip the 'E' */
# Line 2623  for (;; ptr++) Line 2996  for (;; ptr++)
2996        [.ch.] and [=ch=] ("collating elements") and fault them, as Perl        [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2997        5.6 and 5.8 do. */        5.6 and 5.8 do. */
2998    
2999        if (c == '[' &&        if (c == CHAR_LEFT_SQUARE_BRACKET &&
3000            (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&            (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3001            check_posix_syntax(ptr, &tempptr, cd))             ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
3002          {          {
3003          BOOL local_negate = FALSE;          BOOL local_negate = FALSE;
3004          int posix_class, taboffset, tabopt;          int posix_class, taboffset, tabopt;
3005          register const uschar *cbits = cd->cbits;          register const uschar *cbits = cd->cbits;
3006          uschar pbits[32];          uschar pbits[32];
3007    
3008          if (ptr[1] != ':')          if (ptr[1] != CHAR_COLON)
3009            {            {
3010            *errorcodeptr = ERR31;            *errorcodeptr = ERR31;
3011            goto FAILED;            goto FAILED;
3012            }            }
3013    
3014          ptr += 2;          ptr += 2;
3015          if (*ptr == '^')          if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
3016            {            {
3017            local_negate = TRUE;            local_negate = TRUE;
3018              should_flip_negation = TRUE;  /* Note negative special */
3019            ptr++;            ptr++;
3020            }            }
3021    
# Line 2711  for (;; ptr++) Line 3085  for (;; ptr++)
3085        to 'or' into the one we are building. We assume they have more than one        to 'or' into the one we are building. We assume they have more than one
3086        character in them, so set class_charcount bigger than one. */        character in them, so set class_charcount bigger than one. */
3087    
3088        if (c == '\\')        if (c == CHAR_BACKSLASH)
3089          {          {
3090          c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);          c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3091          if (*errorcodeptr != 0) goto FAILED;          if (*errorcodeptr != 0) goto FAILED;
3092    
3093          if (-c == ESC_b) c = '\b';       /* \b is backslash in a class */          if (-c == ESC_b) c = CHAR_BS;       /* \b is backspace in a class */
3094          else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */          else if (-c == ESC_X) c = CHAR_X;   /* \X is literal X in a class */
3095          else if (-c == ESC_R) c = 'R';   /* \R is literal R in a class */          else if (-c == ESC_R) c = CHAR_R;   /* \R is literal R in a class */
3096          else if (-c == ESC_Q)            /* Handle start of quoted string */          else if (-c == ESC_Q)            /* Handle start of quoted string */
3097            {            {
3098            if (ptr[1] == '\\' && ptr[2] == 'E')            if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3099              {              {
3100              ptr += 2; /* avoid empty string */              ptr += 2; /* avoid empty string */
3101              }              }
# Line 2744  for (;; ptr++) Line 3118  for (;; ptr++)
3118              continue;              continue;
3119    
3120              case ESC_D:              case ESC_D:
3121                should_flip_negation = TRUE;
3122              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
3123              continue;              continue;
3124    
# Line 2752  for (;; ptr++) Line 3127  for (;; ptr++)
3127              continue;              continue;
3128    
3129              case ESC_W:              case ESC_W:
3130                should_flip_negation = TRUE;
3131              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
3132              continue;              continue;
3133    
# Line 2761  for (;; ptr++) Line 3137  for (;; ptr++)
3137              continue;              continue;
3138    
3139              case ESC_S:              case ESC_S:
3140                should_flip_negation = TRUE;
3141              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
3142              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
3143              continue;              continue;
3144    
             case ESC_E: /* Perl ignores an orphan \E */  
             continue;  
   
3145              default:    /* Not recognized; fall through */              default:    /* Not recognized; fall through */
3146              break;      /* Need "default" setting to stop compiler warning. */              break;      /* Need "default" setting to stop compiler warning. */
3147              }              }
# Line 2947  for (;; ptr++) Line 3321  for (;; ptr++)
3321        entirely. The code for handling \Q and \E is messy. */        entirely. The code for handling \Q and \E is messy. */
3322    
3323        CHECK_RANGE:        CHECK_RANGE:
3324        while (ptr[1] == '\\' && ptr[2] == 'E')        while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3325          {          {
3326          inescq = FALSE;          inescq = FALSE;
3327          ptr += 2;          ptr += 2;
# Line 2955  for (;; ptr++) Line 3329  for (;; ptr++)
3329    
3330        oldptr = ptr;        oldptr = ptr;
3331    
3332        if (!inescq && ptr[1] == '-')        /* Remember \r or \n */
3333    
3334          if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
3335    
3336          /* Check for range */
3337    
3338          if (!inescq && ptr[1] == CHAR_MINUS)
3339          {          {
3340          int d;          int d;
3341          ptr += 2;          ptr += 2;
3342          while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;          while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
3343    
3344          /* If we hit \Q (not followed by \E) at this point, go into escaped          /* If we hit \Q (not followed by \E) at this point, go into escaped
3345          mode. */          mode. */
3346    
3347          while (*ptr == '\\' && ptr[1] == 'Q')          while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
3348            {            {
3349            ptr += 2;            ptr += 2;
3350            if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }            if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
3351                { ptr += 2; continue; }
3352            inescq = TRUE;            inescq = TRUE;
3353            break;            break;
3354            }            }
3355    
3356          if (*ptr == 0 || (!inescq && *ptr == ']'))          if (*ptr == 0 || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
3357            {            {
3358            ptr = oldptr;            ptr = oldptr;
3359            goto LONE_SINGLE_CHARACTER;            goto LONE_SINGLE_CHARACTER;
# Line 2991  for (;; ptr++) Line 3372  for (;; ptr++)
3372          not any of the other escapes. Perl 5.6 treats a hyphen as a literal          not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3373          in such circumstances. */          in such circumstances. */
3374    
3375          if (!inescq && d == '\\')          if (!inescq && d == CHAR_BACKSLASH)
3376            {            {
3377            d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);            d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3378            if (*errorcodeptr != 0) goto FAILED;            if (*errorcodeptr != 0) goto FAILED;
3379    
3380            /* \b is backslash; \X is literal X; \R is literal R; any other            /* \b is backspace; \X is literal X; \R is literal R; any other
3381            special means the '-' was literal */            special means the '-' was literal */
3382    
3383            if (d < 0)            if (d < 0)
3384              {              {
3385              if (d == -ESC_b) d = '\b';              if (d == -ESC_b) d = CHAR_BS;
3386              else if (d == -ESC_X) d = 'X';              else if (d == -ESC_X) d = CHAR_X;
3387              else if (d == -ESC_R) d = 'R'; else              else if (d == -ESC_R) d = CHAR_R; else
3388                {                {
3389                ptr = oldptr;                ptr = oldptr;
3390                goto LONE_SINGLE_CHARACTER;  /* A few lines below */                goto LONE_SINGLE_CHARACTER;  /* A few lines below */
# Line 3022  for (;; ptr++) Line 3403  for (;; ptr++)
3403    
3404          if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */          if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */
3405    
3406            /* Remember \r or \n */
3407    
3408            if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
3409    
3410          /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless          /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3411          matching, we have to use an XCLASS with extra data items. Caseless          matching, we have to use an XCLASS with extra data items. Caseless
3412          matching for characters > 127 is available only if UCP support is          matching for characters > 127 is available only if UCP support is
# Line 3140  for (;; ptr++) Line 3525  for (;; ptr++)
3525          if ((options & PCRE_CASELESS) != 0)          if ((options & PCRE_CASELESS) != 0)
3526            {            {
3527            unsigned int othercase;            unsigned int othercase;
3528            if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)            if ((othercase = UCD_OTHERCASE(c)) != c)
3529              {              {
3530              *class_utf8data++ = XCL_SINGLE;              *class_utf8data++ = XCL_SINGLE;
3531              class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);              class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
# Line 3167  for (;; ptr++) Line 3552  for (;; ptr++)
3552    
3553      /* Loop until ']' reached. This "while" is the end of the "do" above. */      /* Loop until ']' reached. This "while" is the end of the "do" above. */
3554    
3555      while ((c = *(++ptr)) != 0 && (c != ']' || inescq));      while ((c = *(++ptr)) != 0 && (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
3556    
3557      if (c == 0)                          /* Missing terminating ']' */      if (c == 0)                          /* Missing terminating ']' */
3558        {        {
# Line 3175  for (;; ptr++) Line 3560  for (;; ptr++)
3560        goto FAILED;        goto FAILED;
3561        }        }
3562    
3563    
3564    /* This code has been disabled because it would mean that \s counts as
3565    an explicit \r or \n reference, and that's not really what is wanted. Now
3566    we set the flag only if there is a literal "\r" or "\n" in the class. */
3567    
3568    #if 0
3569        /* Remember whether \r or \n are in this class */
3570    
3571        if (negate_class)
3572          {
3573          if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF;
3574          }
3575        else
3576          {
3577          if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF;
3578          }
3579    #endif
3580    
3581    
3582      /* If class_charcount is 1, we saw precisely one character whose value is      /* If class_charcount is 1, we saw precisely one character whose value is
3583      less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we      less than 256. As long as there were no characters >= 128 and there was no
3584      can optimize the negative case only if there were no characters >= 128      use of \p or \P, in other words, no use of any XCLASS features, we can
3585      because OP_NOT and the related opcodes like OP_NOTSTAR operate on      optimize.
3586      single-bytes only. This is an historical hangover. Maybe one day we can  
3587      tidy these opcodes to handle multi-byte characters.      In UTF-8 mode, we can optimize the negative case only if there were no
3588        characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
3589        operate on single-bytes only. This is an historical hangover. Maybe one day
3590        we can tidy these opcodes to handle multi-byte characters.
3591    
3592      The optimization throws away the bit map. We turn the item into a      The optimization throws away the bit map. We turn the item into a
3593      1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note      1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
# Line 3190  for (;; ptr++) Line 3597  for (;; ptr++)
3597      reqbyte, save the previous value for reinstating. */      reqbyte, save the previous value for reinstating. */
3598    
3599  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
3600      if (class_charcount == 1 &&      if (class_charcount == 1 && !class_utf8 &&
3601            (!utf8 ||        (!utf8 || !negate_class || class_lastchar < 128))
           (!class_utf8 && (!negate_class || class_lastchar < 128))))  
   
3602  #else  #else
3603      if (class_charcount == 1)      if (class_charcount == 1)
3604  #endif  #endif
# Line 3236  for (;; ptr++) Line 3641  for (;; ptr++)
3641      zeroreqbyte = reqbyte;      zeroreqbyte = reqbyte;
3642    
3643      /* If there are characters with values > 255, we have to compile an      /* If there are characters with values > 255, we have to compile an
3644      extended class, with its own opcode. If there are no characters < 256,      extended class, with its own opcode, unless there was a negated special
3645      we can omit the bitmap in the actual compiled code. */      such as \S in the class, because in that case all characters > 255 are in
3646        the class, so any that were explicitly given as well can be ignored. If
3647        (when there are explicit characters > 255 that must be listed) there are no
3648        characters < 256, we can omit the bitmap in the actual compiled code. */
3649    
3650  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
3651      if (class_utf8)      if (class_utf8 && !should_flip_negation)
3652        {        {
3653        *class_utf8data++ = XCL_END;    /* Marks the end of extra data */        *class_utf8data++ = XCL_END;    /* Marks the end of extra data */
3654        *code++ = OP_XCLASS;        *code++ = OP_XCLASS;
# Line 3266  for (;; ptr++) Line 3674  for (;; ptr++)
3674        }        }
3675  #endif  #endif
3676    
3677      /* If there are no characters > 255, negate the 32-byte map if necessary,      /* If there are no characters > 255, set the opcode to OP_CLASS or
3678      and copy it into the code vector. If this is the first thing in the branch,      OP_NCLASS, depending on whether the whole class was negated and whether
3679      there can be no first char setting, whatever the repeat count. Any reqbyte      there were negative specials such as \S in the class. Then copy the 32-byte
3680      setting must remain unchanged after any kind of repeat. */      map into the code vector, negating it if necessary. */
3681    
3682        *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
3683      if (negate_class)      if (negate_class)
3684        {        {
       *code++ = OP_NCLASS;  
3685        if (lengthptr == NULL)    /* Save time in the pre-compile phase */        if (lengthptr == NULL)    /* Save time in the pre-compile phase */
3686          for (c = 0; c < 32; c++) code[c] = ~classbits[c];          for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3687        }        }
3688      else      else
3689        {        {
       *code++ = OP_CLASS;  
3690        memcpy(code, classbits, 32);        memcpy(code, classbits, 32);
3691        }        }
3692      code += 32;      code += 32;
# Line 3290  for (;; ptr++) Line 3697  for (;; ptr++)
3697      /* Various kinds of repeat; '{' is not necessarily a quantifier, but this      /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3698      has been tested above. */      has been tested above. */
3699    
3700      case '{':      case CHAR_LEFT_CURLY_BRACKET:
3701      if (!is_quantifier) goto NORMAL_CHAR;      if (!is_quantifier) goto NORMAL_CHAR;
3702      ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);      ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3703      if (*errorcodeptr != 0) goto FAILED;      if (*errorcodeptr != 0) goto FAILED;
3704      goto REPEAT;      goto REPEAT;
3705    
3706      case '*':      case CHAR_ASTERISK:
3707      repeat_min = 0;      repeat_min = 0;
3708      repeat_max = -1;      repeat_max = -1;
3709      goto REPEAT;      goto REPEAT;
3710    
3711      case '+':      case CHAR_PLUS:
3712      repeat_min = 1;      repeat_min = 1;
3713      repeat_max = -1;      repeat_max = -1;
3714      goto REPEAT;      goto REPEAT;
3715    
3716      case '?':      case CHAR_QUESTION_MARK:
3717      repeat_min = 0;      repeat_min = 0;
3718      repeat_max = 1;      repeat_max = 1;
3719    
# Line 3341  for (;; ptr++) Line 3748  for (;; ptr++)
3748      but if PCRE_UNGREEDY is set, it works the other way round. We change the      but if PCRE_UNGREEDY is set, it works the other way round. We change the
3749      repeat type to the non-default. */      repeat type to the non-default. */
3750    
3751      if (ptr[1] == '+')      if (ptr[1] == CHAR_PLUS)
3752        {        {
3753        repeat_type = 0;                  /* Force greedy */        repeat_type = 0;                  /* Force greedy */
3754        possessive_quantifier = TRUE;        possessive_quantifier = TRUE;
3755        ptr++;        ptr++;
3756        }        }
3757      else if (ptr[1] == '?')      else if (ptr[1] == CHAR_QUESTION_MARK)
3758        {        {
3759        repeat_type = greedy_non_default;        repeat_type = greedy_non_default;
3760        ptr++;        ptr++;
# Line 3465  for (;; ptr++) Line 3872  for (;; ptr++)
3872        /* All real repeats make it impossible to handle partial matching (maybe        /* All real repeats make it impossible to handle partial matching (maybe
3873        one day we will be able to remove this restriction). */        one day we will be able to remove this restriction). */
3874    
3875        if (repeat_max != 1) cd->nopartial = TRUE;        if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3876    
3877        /* Combine the op_type with the repeat_type */        /* Combine the op_type with the repeat_type */
3878    
# Line 3615  for (;; ptr++) Line 4022  for (;; ptr++)
4022        /* All real repeats make it impossible to handle partial matching (maybe        /* All real repeats make it impossible to handle partial matching (maybe
4023        one day we will be able to remove this restriction). */        one day we will be able to remove this restriction). */
4024    
4025        if (repeat_max != 1) cd->nopartial = TRUE;        if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
4026    
4027        if (repeat_min == 0 && repeat_max == -1)        if (repeat_min == 0 && repeat_max == -1)
4028          *code++ = OP_CRSTAR + repeat_type;          *code++ = OP_CRSTAR + repeat_type;
# Line 3673  for (;; ptr++) Line 4080  for (;; ptr++)
4080    
4081        if (repeat_min == 0)        if (repeat_min == 0)
4082          {          {
4083          /* If the maximum is also zero, we just omit the group from the output          /* If the maximum is also zero, we used to just omit the group from the
4084          altogether. */          output altogether, like this:
   
         if (repeat_max == 0)  
           {  
           code = previous;  
           goto END_REPEAT;  
           }  
4085    
4086          /* If the maximum is 1 or unlimited, we just have to stick in the          ** if (repeat_max == 0)
4087          BRAZERO and do no more at this point. However, we do need to adjust          **   {
4088          any OP_RECURSE calls inside the group that refer to the group itself or          **   code = previous;
4089          any internal or forward referenced group, because the offset is from          **   goto END_REPEAT;
4090          the start of the whole regex. Temporarily terminate the pattern while          **   }
4091          doing this. */  
4092            However, that fails when a group is referenced as a subroutine from
4093            elsewhere in the pattern, so now we stick in OP_SKIPZERO in front of it
4094            so that it is skipped on execution. As we don't have a list of which
4095            groups are referenced, we cannot do this selectively.
4096    
4097            If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
4098            and do no more at this point. However, we do need to adjust any
4099            OP_RECURSE calls inside the group that refer to the group itself or any
4100            internal or forward referenced group, because the offset is from the
4101            start of the whole regex. Temporarily terminate the pattern while doing
4102            this. */
4103    
4104          if (repeat_max <= 1)          if (repeat_max <= 1)    /* Covers 0, 1, and unlimited */
4105            {            {
4106            *code = OP_END;            *code = OP_END;
4107            adjust_recurse(previous, 1, utf8, cd, save_hwm);            adjust_recurse(previous, 1, utf8, cd, save_hwm);
4108            memmove(previous+1, previous, len);            memmove(previous+1, previous, len);
4109            code++;            code++;
4110              if (repeat_max == 0)
4111                {
4112                *previous++ = OP_SKIPZERO;
4113                goto END_REPEAT;
4114                }
4115            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
4116            }            }
4117    
# Line 3889  for (;; ptr++) Line 4306  for (;; ptr++)
4306          }          }
4307        }        }
4308    
4309        /* If previous is OP_FAIL, it was generated by an empty class [] in
4310        JavaScript mode. The other ways in which OP_FAIL can be generated, that is
4311        by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat"
4312        error above. We can just ignore the repeat in the JS case. */
4313    
4314        else if (*previous == OP_FAIL) goto END_REPEAT;
4315    
4316      /* Else there's some kind of shambles */      /* Else there's some kind of shambles */
4317    
4318      else      else
# Line 3915  for (;; ptr++) Line 4339  for (;; ptr++)
4339        int len;        int len;
4340        if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||        if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
4341            *tempcode == OP_NOTEXACT)            *tempcode == OP_NOTEXACT)
4342          tempcode += _pcre_OP_lengths[*tempcode];          tempcode += _pcre_OP_lengths[*tempcode] +
4343              ((*tempcode == OP_TYPEEXACT &&
4344                 (tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP))? 2:0);
4345        len = code - tempcode;        len = code - tempcode;
4346        if (len > 0) switch (*tempcode)        if (len > 0) switch (*tempcode)
4347          {          {
# Line 3961  for (;; ptr++) Line 4387  for (;; ptr++)
4387      lookbehind or option setting or condition or all the other extended      lookbehind or option setting or condition or all the other extended
4388      parenthesis forms.  */      parenthesis forms.  */
4389    
4390      case '(':      case CHAR_LEFT_PARENTHESIS:
4391      newoptions = options;      newoptions = options;
4392      skipbytes = 0;      skipbytes = 0;
4393      bravalue = OP_CBRA;      bravalue = OP_CBRA;
# Line 3970  for (;; ptr++) Line 4396  for (;; ptr++)
4396    
4397      /* First deal with various "verbs" that can be introduced by '*'. */      /* First deal with various "verbs" that can be introduced by '*'. */
4398    
4399      if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0)      if (*(++ptr) == CHAR_ASTERISK && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
4400        {        {
4401        int i, namelen;        int i, namelen;
4402          const char *vn = verbnames;
4403        const uschar *name = ++ptr;        const uschar *name = ++ptr;
4404        previous = NULL;        previous = NULL;
4405        while ((cd->ctypes[*++ptr] & ctype_letter) != 0);        while ((cd->ctypes[*++ptr] & ctype_letter) != 0) {};
4406        if (*ptr == ':')        if (*ptr == CHAR_COLON)
4407          {          {
4408          *errorcodeptr = ERR59;   /* Not supported */          *errorcodeptr = ERR59;   /* Not supported */
4409          goto FAILED;          goto FAILED;
4410          }          }
4411        if (*ptr != ')')        if (*ptr != CHAR_RIGHT_PARENTHESIS)
4412          {          {
4413          *errorcodeptr = ERR60;          *errorcodeptr = ERR60;
4414          goto FAILED;          goto FAILED;
# Line 3990  for (;; ptr++) Line 4417  for (;; ptr++)
4417        for (i = 0; i < verbcount; i++)        for (i = 0; i < verbcount; i++)
4418          {          {
4419          if (namelen == verbs[i].len &&          if (namelen == verbs[i].len &&
4420              strncmp((char *)name, verbs[i].name, namelen) == 0)              strncmp((char *)name, vn, namelen) == 0)
4421            {            {
4422            *code = verbs[i].op;            *code = verbs[i].op;
4423            if (*code++ == OP_ACCEPT) cd->had_accept = TRUE;            if (*code++ == OP_ACCEPT) cd->had_accept = TRUE;
4424            break;            break;
4425            }            }
4426            vn += verbs[i].len + 1;
4427          }          }
4428        if (i < verbcount) continue;        if (i < verbcount) continue;
4429        *errorcodeptr = ERR60;        *errorcodeptr = ERR60;
# Line 4005  for (;; ptr++) Line 4433  for (;; ptr++)
4433      /* Deal with the extended parentheses; all are introduced by '?', and the      /* Deal with the extended parentheses; all are introduced by '?', and the
4434      appearance of any of them means that this is not a capturing group. */      appearance of any of them means that this is not a capturing group. */
4435    
4436      else if (*ptr == '?')      else if (*ptr == CHAR_QUESTION_MARK)
4437        {        {
4438        int i, set, unset, namelen;        int i, set, unset, namelen;
4439        int *optset;        int *optset;
# Line 4014  for (;; ptr++) Line 4442  for (;; ptr++)
4442    
4443        switch (*(++ptr))        switch (*(++ptr))
4444          {          {
4445          case '#':                 /* Comment; skip to ket */          case CHAR_NUMBER_SIGN:                 /* Comment; skip to ket */
4446          ptr++;          ptr++;
4447          while (*ptr != 0 && *ptr != ')') ptr++;          while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
4448          if (*ptr == 0)          if (*ptr == 0)
4449            {            {
4450            *errorcodeptr = ERR18;            *errorcodeptr = ERR18;
# Line 4026  for (;; ptr++) Line 4454  for (;; ptr++)
4454    
4455    
4456          /* ------------------------------------------------------------ */          /* ------------------------------------------------------------ */
4457          case '|':                 /* Reset capture count for each branch */          case CHAR_VERTICAL_LINE:  /* Reset capture count for each branch */
4458          reset_bracount = TRUE;          reset_bracount = TRUE;
4459          /* Fall through */          /* Fall through */
4460    
4461          /* ------------------------------------------------------------ */          /* ------------------------------------------------------------ */
4462          case ':':                 /* Non-capturing bracket */          case CHAR_COLON:          /* Non-capturing bracket */
4463          bravalue = OP_BRA;          bravalue = OP_BRA;
4464          ptr++;          ptr++;
4465          break;          break;
4466    
4467    
4468          /* ------------------------------------------------------------ */          /* ------------------------------------------------------------ */
4469          case '(':          case CHAR_LEFT_PARENTHESIS:
4470          bravalue = OP_COND;       /* Conditional group */          bravalue = OP_COND;       /* Conditional group */
4471    
4472          /* A condition can be an assertion, a number (referring to a numbered          /* A condition can be an assertion, a number (referring to a numbered
# Line 4058  for (;; ptr++) Line 4486  for (;; ptr++)
4486          the switch. This will take control down to where bracketed groups,          the switch. This will take control down to where bracketed groups,
4487          including assertions, are processed. */          including assertions, are processed. */
4488    
4489          if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))          if (ptr[1] == CHAR_QUESTION_MARK && (ptr[2] == CHAR_EQUALS_SIGN ||
4490                ptr[2] == CHAR_EXCLAMATION_MARK || ptr[2] == CHAR_LESS_THAN_SIGN))
4491            break;            break;
4492    
4493          /* Most other conditions use OP_CREF (a couple change to OP_RREF          /* Most other conditions use OP_CREF (a couple change to OP_RREF
# Line 4070  for (;; ptr++) Line 4499  for (;; ptr++)
4499    
4500          /* Check for a test for recursion in a named group. */          /* Check for a test for recursion in a named group. */
4501    
4502          if (ptr[1] == 'R' && ptr[2] == '&')          if (ptr[1] == CHAR_R && ptr[2] == CHAR_AMPERSAND)
4503            {            {
4504            terminator = -1;            terminator = -1;
4505            ptr += 2;            ptr += 2;
# Line 4080  for (;; ptr++) Line 4509  for (;; ptr++)
4509          /* Check for a test for a named group's having been set, using the Perl          /* Check for a test for a named group's having been set, using the Perl
4510          syntax (?(<name>) or (?('name') */          syntax (?(<name>) or (?('name') */
4511    
4512          else if (ptr[1] == '<')          else if (ptr[1] == CHAR_LESS_THAN_SIGN)
4513            {            {
4514            terminator = '>';            terminator = CHAR_GREATER_THAN_SIGN;
4515            ptr++;            ptr++;
4516            }            }
4517          else if (ptr[1] == '\'')          else if (ptr[1] == CHAR_APOSTROPHE)
4518            {            {
4519            terminator = '\'';            terminator = CHAR_APOSTROPHE;
4520            ptr++;            ptr++;
4521            }            }
4522          else          else
4523            {            {
4524            terminator = 0;            terminator = 0;
4525            if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);            if (ptr[1] == CHAR_MINUS || ptr[1] == CHAR_PLUS) refsign = *(++ptr);
4526            }            }
4527    
4528          /* We now expect to read a name; any thing else is an error */          /* We now expect to read a name; any thing else is an error */
# Line 4113  for (;; ptr++) Line 4542  for (;; ptr++)
4542            {            {
4543            if (recno >= 0)            if (recno >= 0)
4544              recno = ((digitab[*ptr] & ctype_digit) != 0)?              recno = ((digitab[*ptr] & ctype_digit) != 0)?
4545                recno * 10 + *ptr - '0' : -1;                recno * 10 + *ptr - CHAR_0 : -1;
4546            ptr++;            ptr++;
4547            }            }
4548          namelen = ptr - name;          namelen = ptr - name;
4549    
4550          if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')          if ((terminator > 0 && *ptr++ != terminator) ||
4551                *ptr++ != CHAR_RIGHT_PARENTHESIS)
4552            {            {
4553            ptr--;      /* Error offset */            ptr--;      /* Error offset */
4554            *errorcodeptr = ERR26;            *errorcodeptr = ERR26;
# Line 4140  for (;; ptr++) Line 4570  for (;; ptr++)
4570              *errorcodeptr = ERR58;              *errorcodeptr = ERR58;
4571              goto FAILED;              goto FAILED;
4572              }              }
4573            if (refsign == '-')            recno = (refsign == CHAR_MINUS)?
4574                cd->bracount - recno + 1 : recno +cd->bracount;
4575              if (recno <= 0 || recno > cd->final_bracount)
4576              {              {
4577              recno = cd->bracount - recno + 1;              *errorcodeptr = ERR15;
4578              if (recno <= 0)              goto FAILED;
               {  
               *errorcodeptr = ERR15;  
               goto FAILED;  
               }  
4579              }              }
           else recno += cd->bracount;  
4580            PUT2(code, 2+LINK_SIZE, recno);            PUT2(code, 2+LINK_SIZE, recno);
4581            break;            break;
4582            }            }
# Line 4174  for (;; ptr++) Line 4601  for (;; ptr++)
4601    
4602          /* Search the pattern for a forward reference */          /* Search the pattern for a forward reference */
4603    
4604          else if ((i = find_parens(ptr, cd->bracount, name, namelen,          else if ((i = find_parens(cd, name, namelen,
4605                          (options & PCRE_EXTENDED) != 0)) > 0)                          (options & PCRE_EXTENDED) != 0)) > 0)
4606            {            {
4607            PUT2(code, 2+LINK_SIZE, i);            PUT2(code, 2+LINK_SIZE, i);
# Line 4195  for (;; ptr++) Line 4622  for (;; ptr++)
4622          /* Check for (?(R) for recursion. Allow digits after R to specify a          /* Check for (?(R) for recursion. Allow digits after R to specify a
4623          specific group number. */          specific group number. */
4624    
4625          else if (*name == 'R')          else if (*name == CHAR_R)
4626            {            {
4627            recno = 0;            recno = 0;
4628            for (i = 1; i < namelen; i++)            for (i = 1; i < namelen; i++)
# Line 4205  for (;; ptr++) Line 4632  for (;; ptr++)
4632                *errorcodeptr = ERR15;                *errorcodeptr = ERR15;
4633                goto FAILED;                goto FAILED;
4634                }                }
4635              recno = recno * 10 + name[i] - '0';              recno = recno * 10 + name[i] - CHAR_0;
4636              }              }
4637            if (recno == 0) recno = RREF_ANY;            if (recno == 0) recno = RREF_ANY;
4638            code[1+LINK_SIZE] = OP_RREF;      /* Change test type */            code[1+LINK_SIZE] = OP_RREF;      /* Change test type */
# Line 4215  for (;; ptr++) Line 4642  for (;; ptr++)
4642          /* Similarly, check for the (?(DEFINE) "condition", which is always          /* Similarly, check for the (?(DEFINE) "condition", which is always
4643          false. */          false. */
4644    
4645          else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)          else if (namelen == 6 && strncmp((char *)name, STRING_DEFINE, 6) == 0)
4646            {            {
4647            code[1+LINK_SIZE] = OP_DEF;            code[1+LINK_SIZE] = OP_DEF;
4648            skipbytes = 1;            skipbytes = 1;
4649            }            }
4650    
4651          /* Check for the "name" actually being a subpattern number. */          /* Check for the "name" actually being a subpattern number. We are
4652            in the second pass here, so final_bracount is set. */
4653    
4654          else if (recno > 0)          else if (recno > 0 && recno <= cd->final_bracount)
4655            {            {
4656            PUT2(code, 2+LINK_SIZE, recno);            PUT2(code, 2+LINK_SIZE, recno);
4657            }            }
# Line 4239  for (;; ptr++) Line 4667  for (;; ptr++)
4667    
4668    
4669          /* ------------------------------------------------------------ */          /* ------------------------------------------------------------ */
4670          case '=':                 /* Positive lookahead */          case CHAR_EQUALS_SIGN:                 /* Positive lookahead */
4671          bravalue = OP_ASSERT;          bravalue = OP_ASSERT;
4672          ptr++;          ptr++;
4673          break;          break;
4674    
4675    
4676          /* ------------------------------------------------------------ */          /* ------------------------------------------------------------ */
4677          case '!':                 /* Negative lookahead */          case CHAR_EXCLAMATION_MARK:            /* Negative lookahead */
4678          ptr++;          ptr++;
4679          if (*ptr == ')')          /* Optimize (?!) */          if (*ptr == CHAR_RIGHT_PARENTHESIS)    /* Optimize (?!) */
4680            {            {
4681            *code++ = OP_FAIL;            *code++ = OP_FAIL;
4682            previous = NULL;            previous = NULL;
# Line 4259  for (;; ptr++) Line 4687  for (;; ptr++)
4687    
4688    
4689          /* ------------------------------------------------------------ */          /* ------------------------------------------------------------ */
4690          case '<':                 /* Lookbehind or named define */          case CHAR_LESS_THAN_SIGN:              /* Lookbehind or named define */
4691          switch (ptr[1])          switch (ptr[1])
4692            {            {
4693            case '=':               /* Positive lookbehind */            case CHAR_EQUALS_SIGN:               /* Positive lookbehind */
4694            bravalue = OP_ASSERTBACK;            bravalue = OP_ASSERTBACK;
4695            ptr += 2;            ptr += 2;
4696            break;            break;
4697    
4698            case '!':               /* Negative lookbehind */            case CHAR_EXCLAMATION_MARK:          /* Negative lookbehind */
4699            bravalue = OP_ASSERTBACK_NOT;            bravalue = OP_ASSERTBACK_NOT;
4700            ptr += 2;            ptr += 2;
4701            break;            break;
# Line 4282  for (;; ptr++) Line 4710  for (;; ptr++)
4710    
4711    
4712          /* ------------------------------------------------------------ */          /* ------------------------------------------------------------ */
4713          case '>':                 /* One-time brackets */          case CHAR_GREATER_THAN_SIGN:           /* One-time brackets */
4714          bravalue = OP_ONCE;          bravalue = OP_ONCE;
4715          ptr++;          ptr++;
4716          break;          break;
4717    
4718    
4719          /* ------------------------------------------------------------ */          /* ------------------------------------------------------------ */
4720          case 'C':                 /* Callout - may be followed by digits; */          case CHAR_C:                 /* Callout - may be followed by digits; */
4721          previous_callout = code;  /* Save for later completion */          previous_callout = code;  /* Save for later completion */
4722          after_manual_callout = 1; /* Skip one item before completing */          after_manual_callout = 1; /* Skip one item before completing */
4723          *code++ = OP_CALLOUT;          *code++ = OP_CALLOUT;
4724            {            {
4725            int n = 0;            int n = 0;
4726            while ((digitab[*(++ptr)] & ctype_digit) != 0)            while ((digitab[*(++ptr)] & ctype_digit) != 0)
4727              n = n * 10 + *ptr - '0';              n = n * 10 + *ptr - CHAR_0;
4728            if (*ptr != ')')            if (*ptr != CHAR_RIGHT_PARENTHESIS)
4729              {              {
4730              *errorcodeptr = ERR39;              *errorcodeptr = ERR39;
4731              goto FAILED;              goto FAILED;
# Line 4317  for (;; ptr++) Line 4745  for (;; ptr++)
4745    
4746    
4747          /* ------------------------------------------------------------ */          /* ------------------------------------------------------------ */
4748          case 'P':                 /* Python-style named subpattern handling */          case CHAR_P:              /* Python-style named subpattern handling */
4749          if (*(++ptr) == '=' || *ptr == '>')  /* Reference or recursion */          if (*(++ptr) == CHAR_EQUALS_SIGN ||
4750                *ptr == CHAR_GREATER_THAN_SIGN)  /* Reference or recursion */
4751            {            {
4752            is_recurse = *ptr == '>';            is_recurse = *ptr == CHAR_GREATER_THAN_SIGN;
4753            terminator = ')';            terminator = CHAR_RIGHT_PARENTHESIS;
4754            goto NAMED_REF_OR_RECURSE;            goto NAMED_REF_OR_RECURSE;
4755            }            }
4756          else if (*ptr != '<')    /* Test for Python-style definition */          else if (*ptr != CHAR_LESS_THAN_SIGN)  /* Test for Python-style defn */
4757            {            {
4758            *errorcodeptr = ERR41;            *errorcodeptr = ERR41;
4759            goto FAILED;            goto FAILED;
# Line 4334  for (;; ptr++) Line 4763  for (;; ptr++)
4763    
4764          /* ------------------------------------------------------------ */          /* ------------------------------------------------------------ */
4765          DEFINE_NAME:    /* Come here from (?< handling */          DEFINE_NAME:    /* Come here from (?< handling */
4766          case '\'':          case CHAR_APOSTROPHE:
4767            {            {
4768            terminator = (*ptr == '<')? '>' : '\'';            terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
4769                CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
4770            name = ++ptr;            name = ++ptr;
4771    
4772            while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;            while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
# Line 4410  for (;; ptr++) Line 4840  for (;; ptr++)
4840    
4841    
4842          /* ------------------------------------------------------------ */          /* ------------------------------------------------------------ */
4843          case '&':                 /* Perl recursion/subroutine syntax */          case CHAR_AMPERSAND:            /* Perl recursion/subroutine syntax */
4844          terminator = ')';          terminator = CHAR_RIGHT_PARENTHESIS;
4845          is_recurse = TRUE;          is_recurse = TRUE;
4846          /* Fall through */          /* Fall through */
4847    
4848          /* We come here from the Python syntax above that handles both          /* We come here from the Python syntax above that handles both
4849          references (?P=name) and recursion (?P>name), as well as falling          references (?P=name) and recursion (?P>name), as well as falling
4850          through from the Perl recursion syntax (?&name). */          through from the Perl recursion syntax (?&name). We also come here from
4851            the Perl \k<name> or \k'name' back reference syntax and the \k{name}
4852            .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */
4853    
4854          NAMED_REF_OR_RECURSE:          NAMED_REF_OR_RECURSE:
4855          name = ++ptr;          name = ++ptr;
# Line 4429  for (;; ptr++) Line 4861  for (;; ptr++)
4861    
4862          if (lengthptr != NULL)          if (lengthptr != NULL)
4863            {            {
4864              if (namelen == 0)
4865                {
4866                *errorcodeptr = ERR62;
4867                goto FAILED;
4868                }
4869            if (*ptr != terminator)            if (*ptr != terminator)
4870              {              {
4871              *errorcodeptr = ERR42;              *errorcodeptr = ERR42;
# Line 4442  for (;; ptr++) Line 4879  for (;; ptr++)
4879            recno = 0;            recno = 0;
4880            }            }
4881    
4882          /* In the real compile, seek the name in the table */          /* In the real compile, seek the name in the table. We check the name
4883            first, and then check that we have reached the end of the name in the
4884            table. That way, if the name is longer than any in the table,
4885            the comparison will fail without reading beyond the table entry. */
4886    
4887          else          else
4888            {            {
4889            slot = cd->name_table;            slot = cd->name_table;
4890            for (i = 0; i < cd->names_found; i++)            for (i = 0; i < cd->names_found; i++)
4891              {              {
4892              if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;              if (strncmp((char *)name, (char *)slot+2, namelen) == 0 &&
4893                    slot[2+namelen] == 0)
4894                  break;
4895              slot += cd->name_entry_size;              slot += cd->name_entry_size;
4896              }              }
4897    
# Line 4458  for (;; ptr++) Line 4900  for (;; ptr++)
4900              recno = GET2(slot, 0);              recno = GET2(slot, 0);
4901              }              }
4902            else if ((recno =                /* Forward back reference */            else if ((recno =                /* Forward back reference */
4903                      find_parens(ptr, cd->bracount, name, namelen,                      find_parens(cd, name, namelen,
4904                        (options & PCRE_EXTENDED) != 0)) <= 0)                        (options & PCRE_EXTENDED) != 0)) <= 0)
4905              {              {
4906              *errorcodeptr = ERR15;              *errorcodeptr = ERR15;
# Line 4474  for (;; ptr++) Line 4916  for (;; ptr++)
4916    
4917    
4918          /* ------------------------------------------------------------ */          /* ------------------------------------------------------------ */
4919          case 'R':                 /* Recursion */          case CHAR_R:              /* Recursion */
4920          ptr++;                    /* Same as (?0)      */          ptr++;                    /* Same as (?0)      */
4921          /* Fall through */          /* Fall through */
4922    
4923    
4924          /* ------------------------------------------------------------ */          /* ------------------------------------------------------------ */
4925          case '-': case '+':          case CHAR_MINUS: case CHAR_PLUS:  /* Recursion or subroutine */
4926          case '0': case '1': case '2': case '3': case '4':   /* Recursion or */          case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
4927          case '5': case '6': case '7': case '8': case '9':   /* subroutine */          case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
4928            {            {
4929            const uschar *called;            const uschar *called;
4930              terminator = CHAR_RIGHT_PARENTHESIS;
4931    
4932              /* Come here from the \g<...> and \g'...' code (Oniguruma
4933              compatibility). However, the syntax has been checked to ensure that
4934              the ... are a (signed) number, so that neither ERR63 nor ERR29 will
4935              be called on this path, nor will the jump to OTHER_CHAR_AFTER_QUERY
4936              ever be taken. */
4937    
4938              HANDLE_NUMERICAL_RECURSION:
4939    
4940            if ((refsign = *ptr) == '+') ptr++;            if ((refsign = *ptr) == CHAR_PLUS)
4941            else if (refsign == '-')              {
4942                ptr++;
4943                if ((digitab[*ptr] & ctype_digit) == 0)
4944                  {
4945                  *errorcodeptr = ERR63;
4946                  goto FAILED;
4947                  }
4948                }
4949              else if (refsign == CHAR_MINUS)
4950              {              {
4951              if ((digitab[ptr[1]] & ctype_digit) == 0)              if ((digitab[ptr[1]] & ctype_digit) == 0)
4952                goto OTHER_CHAR_AFTER_QUERY;                goto OTHER_CHAR_AFTER_QUERY;
# Line 4496  for (;; ptr++) Line 4955  for (;; ptr++)
4955    
4956            recno = 0;            recno = 0;
4957            while((digitab[*ptr] & ctype_digit) != 0)            while((digitab[*ptr] & ctype_digit) != 0)
4958              recno = recno * 10 + *ptr++ - '0';              recno = recno * 10 + *ptr++ - CHAR_0;
4959    
4960            if (*ptr != ')')            if (*ptr != terminator)
4961              {              {
4962              *errorcodeptr = ERR29;              *errorcodeptr = ERR29;
4963              goto FAILED;              goto FAILED;
4964              }              }
4965    
4966            if (refsign == '-')            if (refsign == CHAR_MINUS)
4967              {              {
4968              if (recno == 0)              if (recno == 0)
4969                {                {
# Line 4518  for (;; ptr++) Line 4977  for (;; ptr++)
4977                goto FAILED;                goto FAILED;
4978                }                }
4979              }              }
4980            else if (refsign == '+')            else if (refsign == CHAR_PLUS)
4981              {              {
4982              if (recno == 0)              if (recno == 0)
4983                {                {
# Line 4551  for (;; ptr++) Line 5010  for (;; ptr++)
5010    
5011              if (called == NULL)              if (called == NULL)
5012                {                {
5013                if (find_parens(ptr, cd->bracount, NULL, recno,                if (find_parens(cd, NULL, recno,
5014                     (options & PCRE_EXTENDED) != 0) < 0)                      (options & PCRE_EXTENDED) != 0) < 0)
5015                  {                  {
5016                  *errorcodeptr = ERR15;                  *errorcodeptr = ERR15;
5017                  goto FAILED;                  goto FAILED;
# Line 4604  for (;; ptr++) Line 5063  for (;; ptr++)
5063          set = unset = 0;          set = unset = 0;
5064          optset = &set;          optset = &set;
5065    
5066          while (*ptr != ')' && *ptr != ':')          while (*ptr != CHAR_RIGHT_PARENTHESIS && *ptr != CHAR_COLON)
5067            {            {
5068            switch (*ptr++)            switch (*ptr++)
5069              {              {
5070              case '-': optset = &unset; break;              case CHAR_MINUS: optset = &unset; break;
5071    
5072              case 'J':    /* Record that it changed in the external options */              case CHAR_J:    /* Record that it changed in the external options */
5073              *optset |= PCRE_DUPNAMES;              *optset |= PCRE_DUPNAMES;
5074              cd->external_options |= PCRE_JCHANGED;              cd->external_flags |= PCRE_JCHANGED;
5075              break;              break;
5076    
5077              case 'i': *optset |= PCRE_CASELESS; break;              case CHAR_i: *optset |= PCRE_CASELESS; break;
5078              case 'm': *optset |= PCRE_MULTILINE; break;              case CHAR_m: *optset |= PCRE_MULTILINE; break;
5079              case 's': *optset |= PCRE_DOTALL; break;              case CHAR_s: *optset |= PCRE_DOTALL; break;
5080              case 'x': *optset |= PCRE_EXTENDED; break;              case CHAR_x: *optset |= PCRE_EXTENDED; break;
5081              case 'U': *optset |= PCRE_UNGREEDY; break;              case CHAR_U: *optset |= PCRE_UNGREEDY; break;
5082              case 'X': *optset |= PCRE_EXTRA; break;              case CHAR_X: *optset |= PCRE_EXTRA; break;
5083    
5084              default:  *errorcodeptr = ERR12;              default:  *errorcodeptr = ERR12;
5085                        ptr--;    /* Correct the offset */                        ptr--;    /* Correct the offset */
# Line 4651  for (;; ptr++) Line 5110  for (;; ptr++)
5110          both phases.          both phases.
5111    
5112          If we are not at the pattern start, compile code to change the ims          If we are not at the pattern start, compile code to change the ims
5113          options if this setting actually changes any of them. We also pass the          options if this setting actually changes any of them, and reset the
5114          new setting back so that it can be put at the start of any following          greedy defaults and the case value for firstbyte and reqbyte. */
         branches, and when this group ends (if we are in a group), a resetting  
         item can be compiled. */  
5115    
5116          if (*ptr == ')')          if (*ptr == CHAR_RIGHT_PARENTHESIS)
5117            {            {
5118            if (code == cd->start_code + 1 + LINK_SIZE &&            if (code == cd->start_code + 1 + LINK_SIZE &&
5119                 (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))                 (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
5120              {              {
5121              cd->external_options = newoptions;              cd->external_options = newoptions;
             options = newoptions;  
5122              }              }
5123           else           else
5124              {              {
# Line 4671  for (;; ptr++) Line 5127  for (;; ptr++)
5127                *code++ = OP_OPT;                *code++ = OP_OPT;
5128                *code++ = newoptions & PCRE_IMS;                *code++ = newoptions & PCRE_IMS;
5129                }                }
   
             /* Change options at this level, and pass them back for use  
             in subsequent branches. Reset the greedy defaults and the case  
             value for firstbyte and reqbyte. */  
   
             *optionsptr = options = newoptions;  
5130              greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);              greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
5131              greedy_non_default = greedy_default ^ 1;              greedy_non_default = greedy_default ^ 1;
5132              req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;              req_caseopt = ((newoptions & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
5133              }              }
5134    
5135              /* Change options at this level, and pass them back for use
5136              in subsequent branches. When not at the start of the pattern, this
5137              information is also necessary so that a resetting item can be
5138              compiled at the end of a group (if we are in a group). */
5139    
5140              *optionsptr = options = newoptions;
5141            previous = NULL;       /* This item can't be repeated */            previous = NULL;       /* This item can't be repeated */
5142            continue;              /* It is complete */            continue;              /* It is complete */
5143            }            }
# Line 4797  for (;; ptr++) Line 5253  for (;; ptr++)
5253    
5254      /* Error if hit end of pattern */      /* Error if hit end of pattern */
5255    
5256      if (*ptr != ')')      if (*ptr != CHAR_RIGHT_PARENTHESIS)
5257        {        {
5258        *errorcodeptr = ERR14;        *errorcodeptr = ERR14;
5259        goto FAILED;        goto FAILED;
# Line 4895  for (;; ptr++) Line 5351  for (;; ptr++)
5351      We can test for values between ESC_b and ESC_Z for the latter; this may      We can test for values between ESC_b and ESC_Z for the latter; this may
5352      have to change if any new ones are ever created. */      have to change if any new ones are ever created. */
5353    
5354      case '\\':      case CHAR_BACKSLASH:
5355      tempptr = ptr;      tempptr = ptr;
5356      c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);      c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
5357      if (*errorcodeptr != 0) goto FAILED;      if (*errorcodeptr != 0) goto FAILED;
# Line 4904  for (;; ptr++) Line 5360  for (;; ptr++)
5360        {        {
5361        if (-c == ESC_Q)            /* Handle start of quoted string */        if (-c == ESC_Q)            /* Handle start of quoted string */
5362          {          {
5363          if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */          if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
5364            else inescq = TRUE;            ptr += 2;               /* avoid empty string */
5365                else inescq = TRUE;
5366          continue;          continue;
5367          }          }
5368    
# Line 4922  for (;; ptr++) Line 5379  for (;; ptr++)
5379        zerofirstbyte = firstbyte;        zerofirstbyte = firstbyte;
5380        zeroreqbyte = reqbyte;        zeroreqbyte = reqbyte;
5381    
5382          /* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n'
5383          is a subroutine call by number (Oniguruma syntax). In fact, the value
5384          -ESC_g is returned only for these cases. So we don't need to check for <
5385          or ' if the value is -ESC_g. For the Perl syntax \g{n} the value is
5386          -ESC_REF+n, and for the Perl syntax \g{name} the result is -ESC_k (as
5387          that is a synonym for a named back reference). */
5388    
5389          if (-c == ESC_g)
5390            {
5391            const uschar *p;
5392            save_hwm = cd->hwm;   /* Normally this is set when '(' is read */
5393            terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
5394              CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
5395    
5396            /* These two statements stop the compiler from warning about possibly
5397            unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In
5398            fact, because we actually check for a number below, the paths that
5399            would actually be in error are never taken. */
5400    
5401            skipbytes = 0;
5402            reset_bracount = FALSE;
5403    
5404            /* Test for a name */
5405    
5406            if (ptr[1] != CHAR_PLUS && ptr[1] != CHAR_MINUS)
5407              {
5408              BOOL isnumber = TRUE;
5409              for (p = ptr + 1; *p != 0 && *p != terminator; p++)
5410                {
5411                if ((cd->ctypes[*p] & ctype_digit) == 0) isnumber = FALSE;
5412                if ((cd->ctypes[*p] & ctype_word) == 0) break;
5413                }
5414              if (*p != terminator)
5415                {
5416                *errorcodeptr = ERR57;
5417                break;
5418                }
5419              if (isnumber)
5420                {
5421                ptr++;
5422                goto HANDLE_NUMERICAL_RECURSION;
5423                }
5424              is_recurse = TRUE;
5425              goto NAMED_REF_OR_RECURSE;
5426              }
5427    
5428            /* Test a signed number in angle brackets or quotes. */
5429    
5430            p = ptr + 2;
5431            while ((digitab[*p] & ctype_digit) != 0) p++;
5432            if (*p != terminator)
5433              {
5434              *errorcodeptr = ERR57;
5435              break;
5436              }
5437            ptr++;
5438            goto HANDLE_NUMERICAL_RECURSION;
5439            }
5440    
5441        /* \k<name> or \k'name' is a back reference by name (Perl syntax).        /* \k<name> or \k'name' is a back reference by name (Perl syntax).
5442        We also support \k{name} (.NET syntax) */        We also support \k{name} (.NET syntax) */
5443    
5444        if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\'' || ptr[1] == '{'))        if (-c == ESC_k && (ptr[1] == CHAR_LESS_THAN_SIGN ||
5445              ptr[1] == CHAR_APOSTROPHE || ptr[1] == CHAR_LEFT_CURLY_BRACKET))
5446          {          {
5447          is_recurse = FALSE;          is_recurse = FALSE;
5448          terminator = (*(++ptr) == '<')? '>' : (*ptr == '\'')? '\'' : '}';          terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
5449              CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
5450              CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
5451          goto NAMED_REF_OR_RECURSE;          goto NAMED_REF_OR_RECURSE;
5452          }          }
5453    
# Line 5029  for (;; ptr++) Line 5548  for (;; ptr++)
5548      *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;      *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
5549      for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];      for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
5550    
5551        /* Remember if \r or \n were seen */
5552    
5553        if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL)
5554          cd->external_flags |= PCRE_HASCRORLF;
5555    
5556      /* Set the first and required bytes appropriately. If no previous first      /* Set the first and required bytes appropriately. If no previous first
5557      byte, set it from this character, but revert to none on a zero repeat.      byte, set it from this character, but revert to none on a zero repeat.
5558      Otherwise, leave the firstbyte value alone, and don't change it on a zero      Otherwise, leave the firstbyte value alone, and don't change it on a zero
# Line 5271  for (;;) Line 5795  for (;;)
5795    compile a resetting op-code following, except at the very end of the pattern.    compile a resetting op-code following, except at the very end of the pattern.
5796    Return leaving the pointer at the terminating char. */    Return leaving the pointer at the terminating char. */
5797    
5798    if (*ptr != '|')    if (*ptr != CHAR_VERTICAL_LINE)
5799      {      {
5800      if (lengthptr == NULL)      if (lengthptr == NULL)
5801        {        {
# Line 5294  for (;;) Line 5818  for (;;)
5818    
5819      /* Resetting option if needed */      /* Resetting option if needed */
5820    
5821      if ((options & PCRE_IMS) != oldims && *ptr == ')')      if ((options & PCRE_IMS) != oldims && *ptr == CHAR_RIGHT_PARENTHESIS)
5822        {        {
5823        *code++ = OP_OPT;        *code++ = OP_OPT;
5824        *code++ = oldims;        *code++ = oldims;
# Line 5423  do { Line 5947  do {
5947       if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;       if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5948       }       }
5949    
5950     /* .* is not anchored unless DOTALL is set and it isn't in brackets that     /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
5951     are or may be referenced. */     it isn't in brackets that are or may be referenced. */
5952    
5953     else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||     else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
5954               op == OP_TYPEPOSSTAR) &&               op == OP_TYPEPOSSTAR))
             (*options & PCRE_DOTALL) != 0)  
5955       {       {
5956       if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;       if (scode[1] != OP_ALLANY || (bracket_map & backref_map) != 0)
5957           return FALSE;
5958       }       }
5959    
5960     /* Check for explicit anchoring */     /* Check for explicit anchoring */
# Line 5476  do { Line 6000  do {
6000       NULL, 0, FALSE);       NULL, 0, FALSE);
6001     register int op = *scode;     register int op = *scode;
6002    
6003       /* If we are at the start of a conditional assertion group, *both* the
6004       conditional assertion *and* what follows the condition must satisfy the test
6005       for start of line. Other kinds of condition fail. Note that there may be an
6006       auto-callout at the start of a condition. */
6007    
6008       if (op == OP_COND)
6009         {
6010         scode += 1 + LINK_SIZE;
6011         if (*scode == OP_CALLOUT) scode += _pcre_OP_lengths[OP_CALLOUT];
6012         switch (*scode)
6013           {
6014           case OP_CREF:
6015           case OP_RREF:
6016           case OP_DEF:
6017           return FALSE;
6018    
6019           default:     /* Assertion */
6020           if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
6021           do scode += GET(scode, 1); while (*scode == OP_ALT);
6022           scode += 1 + LINK_SIZE;
6023           break;
6024           }
6025         scode = first_significant_code(scode, NULL, 0, FALSE);
6026         op = *scode;
6027         }
6028    
6029     /* Non-capturing brackets */     /* Non-capturing brackets */
6030    
6031     if (op == OP_BRA)     if (op == OP_BRA)
# Line 5494  do { Line 6044  do {
6044    
6045     /* Other brackets */     /* Other brackets */
6046    
6047     else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)     else if (op == OP_ASSERT || op == OP_ONCE)
6048       { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }       {
6049         if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
6050         }
6051    
6052     /* .* means "start at start or after \n" if it isn't in brackets that     /* .* means "start at start or after \n" if it isn't in brackets that
6053     may be referenced. */     may be referenced. */
# Line 5612  Returns: pointer to compiled data Line 6164  Returns: pointer to compiled data
6164                  with errorptr and erroroffset set                  with errorptr and erroroffset set
6165  */  */
6166    
6167  PCRE_EXP_DEFN pcre *  PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
6168  pcre_compile(const char *pattern, int options, const char **errorptr,  pcre_compile(const char *pattern, int options, const char **errorptr,
6169    int *erroroffset, const unsigned char *tables)    int *erroroffset, const unsigned char *tables)
6170  {  {
# Line 5620  return pcre_compile2(pattern, options, N Line 6172  return pcre_compile2(pattern, options, N
6172  }  }
6173    
6174    
6175  PCRE_EXP_DEFN pcre *  PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
6176  pcre_compile2(const char *pattern, int options, int *errorcodeptr,  pcre_compile2(const char *pattern, int options, int *errorcodeptr,
6177    const char **errorptr, int *erroroffset, const unsigned char *tables)    const char **errorptr, int *erroroffset, const unsigned char *tables)
6178  {  {
# Line 5628  real_pcre *re; Line 6180  real_pcre *re;
6180  int length = 1;  /* For final END opcode */  int length = 1;  /* For final END opcode */
6181  int firstbyte, reqbyte, newline;  int firstbyte, reqbyte, newline;
6182  int errorcode = 0;  int errorcode = 0;
6183    int skipatstart = 0;
6184  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
6185  BOOL utf8;  BOOL utf8;
6186  #endif  #endif
# Line 5646  to fill in forward references to subpatt Line 6199  to fill in forward references to subpatt
6199    
6200  uschar cworkspace[COMPILE_WORK_SIZE];  uschar cworkspace[COMPILE_WORK_SIZE];
6201    
   
6202  /* Set this early so that early errors get offset 0. */  /* Set this early so that early errors get offset 0. */
6203    
6204  ptr = (const uschar *)pattern;  ptr = (const uschar *)pattern;
# Line 5692  if ((options & PCRE_UTF8) != 0) Line 6244  if ((options & PCRE_UTF8) != 0)
6244    }    }
6245  #endif  #endif
6246    
6247  if ((options & ~PUBLIC_OPTIONS) != 0)  if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0)
6248    {    {
6249    errorcode = ERR17;    errorcode = ERR17;
6250    goto PCRE_EARLY_ERROR_RETURN;    goto PCRE_EARLY_ERROR_RETURN;
# Line 5706  cd->fcc = tables + fcc_offset; Line 6258  cd->fcc = tables + fcc_offset;
6258  cd->cbits = tables + cbits_offset;  cd->cbits = tables + cbits_offset;
6259  cd->ctypes = tables + ctypes_offset;  cd->ctypes = tables + ctypes_offset;
6260    
6261    /* Check for global one-time settings at the start of the pattern, and remember
6262    the offset for later. */
6263    
6264    while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
6265           ptr[skipatstart+1] == CHAR_ASTERISK)
6266      {
6267      int newnl = 0;
6268      int newbsr = 0;
6269    
6270      if (strncmp((char *)(ptr+skipatstart+2), STRING_CR_RIGHTPAR, 3) == 0)
6271        { skipatstart += 5; newnl = PCRE_NEWLINE_CR; }
6272      else if (strncmp((char *)(ptr+skipatstart+2), STRING_LF_RIGHTPAR, 3)  == 0)
6273        { skipatstart += 5; newnl = PCRE_NEWLINE_LF; }
6274      else if (strncmp((char *)(ptr+skipatstart+2), STRING_CRLF_RIGHTPAR, 5)  == 0)
6275        { skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; }
6276      else if (strncmp((char *)(ptr+skipatstart+2), STRING_ANY_RIGHTPAR, 4) == 0)
6277        { skipatstart += 6; newnl = PCRE_NEWLINE_ANY; }
6278      else if (strncmp((char *)(ptr+skipatstart+2), STRING_ANYCRLF_RIGHTPAR, 8) == 0)
6279        { skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; }
6280    
6281      else if (strncmp((char *)(ptr+skipatstart+2), STRING_BSR_ANYCRLF_RIGHTPAR, 12) == 0)
6282        { skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; }
6283      else if (strncmp((char *)(ptr+skipatstart+2), STRING_BSR_UNICODE_RIGHTPAR, 12) == 0)
6284        { skipatstart += 14; newbsr = PCRE_BSR_UNICODE; }
6285    
6286      if (newnl != 0)
6287        options = (options & ~PCRE_NEWLINE_BITS) | newnl;
6288      else if (newbsr != 0)
6289        options = (options & ~(PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) | newbsr;
6290      else break;
6291      }
6292    
6293    /* Check validity of \R options. */
6294    
6295    switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6296      {
6297      case 0:
6298      case PCRE_BSR_ANYCRLF:
6299      case PCRE_BSR_UNICODE:
6300      break;
6301      default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
6302      }
6303    
6304  /* Handle different types of newline. The three bits give seven cases. The  /* Handle different types of newline. The three bits give seven cases. The
6305  current code allows for fixed one- or two-byte sequences, plus "any" and  current code allows for fixed one- or two-byte sequences, plus "any" and
6306  "anycrlf". */  "anycrlf". */
6307    
6308  switch (options & (PCRE_NEWLINE_CRLF | PCRE_NEWLINE_ANY))  switch (options & PCRE_NEWLINE_BITS)
6309    {    {
6310    case 0: newline = NEWLINE; break;   /* Compile-time default */    case 0: newline = NEWLINE; break;   /* Build-time default */
6311    case PCRE_NEWLINE_CR: newline = '\r'; break;    case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
6312    case PCRE_NEWLINE_LF: newline = '\n'; break;    case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
6313    case PCRE_NEWLINE_CR+    case PCRE_NEWLINE_CR+
6314         PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;         PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
6315    case PCRE_NEWLINE_ANY: newline = -1; break;    case PCRE_NEWLINE_ANY: newline = -1; break;
6316    case PCRE_NEWLINE_ANYCRLF: newline = -2; break;    case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6317    default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;    default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
# Line 5765  to compile parts of the pattern into; th Line 6360  to compile parts of the pattern into; th
6360  no longer needed, so hopefully this workspace will never overflow, though there  no longer needed, so hopefully this workspace will never overflow, though there
6361  is a test for its doing so. */  is a test for its doing so. */
6362    
6363  cd->bracount = 0;  cd->bracount = cd->final_bracount = 0;
6364  cd->names_found = 0;  cd->names_found = 0;
6365  cd->name_entry_size = 0;  cd->name_entry_size = 0;
6366  cd->name_table = NULL;  cd->name_table = NULL;
# Line 5775  cd->hwm = cworkspace; Line 6370  cd->hwm = cworkspace;
6370  cd->start_pattern = (const uschar *)pattern;  cd->start_pattern = (const uschar *)pattern;
6371  cd->end_pattern = (const uschar *)(pattern + strlen(pattern));  cd->end_pattern = (const uschar *)(pattern + strlen(pattern));
6372  cd->req_varyopt = 0;  cd->req_varyopt = 0;
 cd->nopartial = FALSE;  
6373  cd->external_options = options;  cd->external_options = options;
6374    cd->external_flags = 0;
6375    
6376  /* Now do the pre-compile. On error, errorcode will be set non-zero, so we  /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
6377  don't need to look at the result of the function here. The initial options have  don't need to look at the result of the function here. The initial options have
# Line 5784  been put into the cd block so that they Line 6379  been put into the cd block so that they
6379  found within the regex right at the beginning. Bringing initial option settings  found within the regex right at the beginning. Bringing initial option settings
6380  outside can help speed up starting point checks. */  outside can help speed up starting point checks. */
6381    
6382    ptr += skipatstart;
6383  code = cworkspace;  code = cworkspace;
6384  *code = OP_BRA;  *code = OP_BRA;
6385  (void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,  (void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,
# Line 5814  if (re == NULL) Line 6410  if (re == NULL)
6410    goto PCRE_EARLY_ERROR_RETURN;    goto PCRE_EARLY_ERROR_RETURN;
6411    }    }
6412    
6413  /* Put in the magic number, and save the sizes, initial options, and character  /* Put in the magic number, and save the sizes, initial options, internal
6414  table pointer. NULL is used for the default character tables. The nullpad field  flags, and character table pointer. NULL is used for the default character
6415  is at the end; it's there to help in the case when a regex compiled on a system  tables. The nullpad field is at the end; it's there to help in the case when a
6416  with 4-byte pointers is run on another with 8-byte pointers. */  regex compiled on a system with 4-byte pointers is run on another with 8-byte
6417    pointers. */
6418    
6419  re->magic_number = MAGIC_NUMBER;  re->magic_number = MAGIC_NUMBER;
6420  re->size = size;  re->size = size;
6421  re->options = cd->external_options;  re->options = cd->external_options;
6422    re->flags = cd->external_flags;
6423  re->dummy1 = 0;  re->dummy1 = 0;
6424  re->first_byte = 0;  re->first_byte = 0;
6425  re->req_byte = 0;  re->req_byte = 0;
# Line 5839  field. Reset the bracket count and the n Line 6437  field. Reset the bracket count and the n
6437  field; this time it's used for remembering forward references to subpatterns.  field; this time it's used for remembering forward references to subpatterns.
6438  */  */
6439    
6440    cd->final_bracount = cd->bracount;  /* Save for checking forward references */
6441  cd->bracount = 0;  cd->bracount = 0;
6442  cd->names_found = 0;  cd->names_found = 0;
6443  cd->name_table = (uschar *)re + re->name_table_offset;  cd->name_table = (uschar *)re + re->name_table_offset;
# Line 5846  codestart = cd->name_table + re->name_en Line 6445  codestart = cd->name_table + re->name_en
6445  cd->start_code = codestart;  cd->start_code = codestart;
6446  cd->hwm = cworkspace;  cd->hwm = cworkspace;
6447  cd->req_varyopt = 0;  cd->req_varyopt = 0;
 cd->nopartial = FALSE;  
6448  cd->had_accept = FALSE;  cd->had_accept = FALSE;
6449    
6450  /* Set up a starting, non-extracting bracket, then compile the expression. On  /* Set up a starting, non-extracting bracket, then compile the expression. On
6451  error, errorcode will be set non-zero, so we don't need to look at the result  error, errorcode will be set non-zero, so we don't need to look at the result
6452  of the function here. */  of the function here. */
6453    
6454  ptr = (const uschar *)pattern;  ptr = (const uschar *)pattern + skipatstart;
6455  code = (uschar *)codestart;  code = (uschar *)codestart;
6456  *code = OP_BRA;  *code = OP_BRA;
6457  (void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,  (void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,
6458    &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);    &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);
6459  re->top_bracket = cd->bracount;  re->top_bracket = cd->bracount;
6460  re->top_backref = cd->top_backref;  re->top_backref = cd->top_backref;
6461    re->flags = cd->external_flags;
6462    
 if (cd->nopartial) re->options |= PCRE_NOPARTIAL;  
6463  if (cd->had_accept) reqbyte = -1;   /* Must disable after (*ACCEPT) */  if (cd->had_accept) reqbyte = -1;   /* Must disable after (*ACCEPT) */
6464    
6465  /* If not reached end of pattern on success, there's an excess bracket. */  /* If not reached end of pattern on success, there's an excess bracket. */
# Line 5904  if (errorcode != 0) Line 6502  if (errorcode != 0)
6502    PCRE_EARLY_ERROR_RETURN:    PCRE_EARLY_ERROR_RETURN:
6503    *erroroffset = ptr - (const uschar *)pattern;    *erroroffset = ptr - (const uschar *)pattern;
6504    PCRE_EARLY_ERROR_RETURN2:    PCRE_EARLY_ERROR_RETURN2:
6505    *errorptr = error_texts[errorcode];    *errorptr = find_error_text(errorcode);
6506    if (errorcodeptr != NULL) *errorcodeptr = errorcode;    if (errorcodeptr != NULL) *errorcodeptr = errorcode;
6507    return NULL;    return NULL;
6508    }    }
# Line 5933  if ((re->options & PCRE_ANCHORED) == 0) Line 6531  if ((re->options & PCRE_ANCHORED) == 0)
6531        int ch = firstbyte & 255;        int ch = firstbyte & 255;
6532        re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&        re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
6533           cd->fcc[ch] == ch)? ch : firstbyte;           cd->fcc[ch] == ch)? ch : firstbyte;
6534        re->options |= PCRE_FIRSTSET;        re->flags |= PCRE_FIRSTSET;
6535        }        }
6536      else if (is_startline(codestart, 0, cd->backref_map))      else if (is_startline(codestart, 0, cd->backref_map))
6537        re->options |= PCRE_STARTLINE;        re->flags |= PCRE_STARTLINE;
6538      }      }
6539    }    }
6540    
# Line 5950  if (reqbyte >= 0 && Line 6548  if (reqbyte >= 0 &&
6548    int ch = reqbyte & 255;    int ch = reqbyte & 255;
6549    re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&    re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
6550      cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;      cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
6551    re->options |= PCRE_REQCHSET;    re->flags |= PCRE_REQCHSET;
6552    }    }
6553    
6554  /* Print out the compiled data if debugging is enabled. This is never the  /* Print out the compiled data if debugging is enabled. This is never the
# Line 5961  case when building a production library. Line 6559  case when building a production library.
6559  printf("Length = %d top_bracket = %d top_backref = %d\n",  printf("Length = %d top_bracket = %d top_backref = %d\n",
6560    length, re->top_bracket, re->top_backref);    length, re->top_bracket, re->top_backref);
6561    
6562  if (re->options != 0)  printf("Options=%08x\n", re->options);
   {  
   printf("%s%s%s%s%s%s%s%s%s\n",  
     ((re->options & PCRE_NOPARTIAL) != 0)? "nopartial " : "",  
     ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",  
     ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",  
     ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",  
     ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",  
     ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",  
     ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",  
     ((re->options & PCRE_EXTRA) != 0)? "extra " : "",  
     ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");  
   }  
6563    
6564  if ((re->options & PCRE_FIRSTSET) != 0)  if ((re->flags & PCRE_FIRSTSET) != 0)
6565    {    {
6566    int ch = re->first_byte & 255;    int ch = re->first_byte & 255;
6567    const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)?    const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)?
# Line 5984  if ((re->options & PCRE_FIRSTSET) != 0) Line 6570  if ((re->options & PCRE_FIRSTSET) != 0)
6570      else printf("First char = \\x%02x%s\n", ch, caseless);      else printf("First char = \\x%02x%s\n", ch, caseless);
6571    }    }
6572    
6573  if ((re->options & PCRE_REQCHSET) != 0)  if ((re->flags & PCRE_REQCHSET) != 0)
6574    {    {
6575    int ch = re->req_byte & 255;    int ch = re->req_byte & 255;
6576    const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)?    const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)?
# Line 6001  was compiled can be seen. */ Line 6587  was compiled can be seen. */
6587  if (code - codestart > length)  if (code - codestart > length)
6588    {    {
6589    (pcre_free)(re);    (pcre_free)(re);
6590    *errorptr = error_texts[ERR23];    *errorptr = find_error_text(ERR23);
6591    *erroroffset = ptr - (uschar *)pattern;    *erroroffset = ptr - (uschar *)pattern;
6592    if (errorcodeptr != NULL) *errorcodeptr = ERR23;    if (errorcodeptr != NULL) *errorcodeptr = ERR23;
6593    return NULL;    return NULL;

Legend:
Removed from v.221  
changed lines
  Added in v.408

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12