/[pcre]/code/tags/pcre-7.1/pcre_compile.c
ViewVC logotype

Diff of /code/tags/pcre-7.1/pcre_compile.c

Parent Directory | Revision Log | View Patch

revision 87 by nigel, Sat Feb 24 21:41:21 2007 UTC | revision 107 by ph10, Wed Mar 7 11:02:28 2007 UTC
# Line 42  POSSIBILITY OF SUCH DAMAGE. Line 42  POSSIBILITY OF SUCH DAMAGE.
42  supporting internal functions that are not used by other modules. */  supporting internal functions that are not used by other modules. */
43    
44    
45    #define NLBLOCK cd             /* Block containing newline information */
46    #define PSSTART start_pattern  /* Field containing processed string start */
47    #define PSEND   end_pattern    /* Field containing processed string end */
48    
49    
50  #include "pcre_internal.h"  #include "pcre_internal.h"
51    
52    
# Line 53  used by pcretest. DEBUG is not defined w Line 58  used by pcretest. DEBUG is not defined w
58  #endif  #endif
59    
60    
   
61  /*************************************************  /*************************************************
62  *      Code parameters and static tables         *  *      Code parameters and static tables         *
63  *************************************************/  *************************************************/
64    
65  /* Maximum number of items on the nested bracket stacks at compile time. This  /* This value specifies the size of stack workspace that is used during the
66  applies to the nesting of all kinds of parentheses. It does not limit  first pre-compile phase that determines how much memory is required. The regex
67  un-nested, non-capturing parentheses. This number can be made bigger if  is partly compiled into this space, but the compiled parts are discarded as
68  necessary - it is used to dimension one int and one unsigned char vector at  soon as they can be, so that hopefully there will never be an overrun. The code
69  compile time. */  does, however, check for an overrun. The largest amount I've seen used is 218,
70    so this number is very generous.
71    
72    The same workspace is used during the second, actual compile phase for
73    remembering forward references to groups so that they can be filled in at the
74    end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
75    is 4 there is plenty of room. */
76    
77  #define BRASTACK_SIZE 200  #define COMPILE_WORK_SIZE (4096)
78    
79    
80  /* Table for handling escaped characters in the range '0'-'z'. Positive returns  /* Table for handling escaped characters in the range '0'-'z'. Positive returns
# Line 72  are simple data values; negative values Line 82  are simple data values; negative values
82  on. Zero means further processing is needed (for things like \x), or the escape  on. Zero means further processing is needed (for things like \x), or the escape
83  is invalid. */  is invalid. */
84    
85  #if !EBCDIC   /* This is the "normal" table for ASCII systems */  #ifndef EBCDIC  /* This is the "normal" table for ASCII systems */
86  static const short int escapes[] = {  static const short int escapes[] = {
87       0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */       0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */
88       0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */       0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */
89     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */
90       0,      0,      0,      0,      0,      0,      0,      0,   /* H - O */       0,      0,      0,      0,      0,      0,      0,      0,   /* H - O */
91  -ESC_P, -ESC_Q,      0, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */  -ESC_P, -ESC_Q, -ESC_R, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */
92  -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */  -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */
93     '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */     '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */
94       0,      0,      0,      0,      0,      0,  ESC_n,      0,   /* h - o */       0,      0,      0, -ESC_k,      0,      0,  ESC_n,      0,   /* h - o */
95  -ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0,      0, -ESC_w,   /* p - w */  -ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0,      0, -ESC_w,   /* p - w */
96       0,      0, -ESC_z                                            /* x - z */       0,      0, -ESC_z                                            /* x - z */
97  };  };
98    
99  #else         /* This is the "abnormal" table for EBCDIC systems */  #else           /* This is the "abnormal" table for EBCDIC systems */
100  static const short int escapes[] = {  static const short int escapes[] = {
101  /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',  /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
102  /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,  /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
# Line 97  static const short int escapes[] = { Line 107  static const short int escapes[] = {
107  /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',  /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
108  /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,  /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
109  /*  88 */     0,     0,      0,     '{',      0,     0,      0,      0,  /*  88 */     0,     0,      0,     '{',      0,     0,      0,      0,
110  /*  90 */     0,     0,      0,     'l',      0, ESC_n,      0, -ESC_p,  /*  90 */     0,     0, -ESC_k,     'l',      0, ESC_n,      0, -ESC_p,
111  /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,  /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
112  /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,     0, -ESC_w,      0,  /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,     0, -ESC_w,      0,
113  /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,  /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
# Line 106  static const short int escapes[] = { Line 116  static const short int escapes[] = {
116  /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,  /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
117  /*  C8 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  C8 */     0,     0,      0,       0,      0,     0,      0,      0,
118  /*  D0 */   '}',     0,      0,       0,      0,     0,      0, -ESC_P,  /*  D0 */   '}',     0,      0,       0,      0,     0,      0, -ESC_P,
119  /*  D8 */-ESC_Q,     0,      0,       0,      0,     0,      0,      0,  /*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
120  /*  E0 */  '\\',     0, -ESC_S,       0,      0,     0, -ESC_W, -ESC_X,  /*  E0 */  '\\',     0, -ESC_S,       0,      0,     0, -ESC_W, -ESC_X,
121  /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,  /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
122  /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
# Line 155  static const int posix_class_maps[] = { Line 165  static const int posix_class_maps[] = {
165  };  };
166    
167    
168    #define STRING(a)  # a
169    #define XSTRING(s) STRING(s)
170    
171  /* The texts of compile-time error messages. These are "char *" because they  /* The texts of compile-time error messages. These are "char *" because they
172  are passed to the outside world. */  are passed to the outside world. Do not ever re-use any error number, because
173    they are documented. Always add a new error instead. Messages marked DEAD below
174    are no longer used. */
175    
176  static const char *error_texts[] = {  static const char *error_texts[] = {
177    "no error",    "no error",
# Line 171  static const char *error_texts[] = { Line 186  static const char *error_texts[] = {
186    "range out of order in character class",    "range out of order in character class",
187    "nothing to repeat",    "nothing to repeat",
188    /* 10 */    /* 10 */
189    "operand of unlimited repeat could match the empty string",    "operand of unlimited repeat could match the empty string",  /** DEAD **/
190    "internal error: unexpected repeat",    "internal error: unexpected repeat",
191    "unrecognized character after (?",    "unrecognized character after (?",
192    "POSIX named classes are supported only within a class",    "POSIX named classes are supported only within a class",
# Line 181  static const char *error_texts[] = { Line 196  static const char *error_texts[] = {
196    "erroffset passed as NULL",    "erroffset passed as NULL",
197    "unknown option bit(s) set",    "unknown option bit(s) set",
198    "missing ) after comment",    "missing ) after comment",
199    "parentheses nested too deeply",    "parentheses nested too deeply",  /** DEAD **/
200    /* 20 */    /* 20 */
201    "regular expression too large",    "regular expression too large",
202    "failed to get memory",    "failed to get memory",
# Line 190  static const char *error_texts[] = { Line 205  static const char *error_texts[] = {
205    "unrecognized character after (?<",    "unrecognized character after (?<",
206    /* 25 */    /* 25 */
207    "lookbehind assertion is not fixed length",    "lookbehind assertion is not fixed length",
208    "malformed number after (?(",    "malformed number or name after (?(",
209    "conditional group contains more than two branches",    "conditional group contains more than two branches",
210    "assertion expected after (?(",    "assertion expected after (?(",
211    "(?R or (?digits must be followed by )",    "(?R or (?digits must be followed by )",
# Line 198  static const char *error_texts[] = { Line 213  static const char *error_texts[] = {
213    "unknown POSIX class name",    "unknown POSIX class name",
214    "POSIX collating elements are not supported",    "POSIX collating elements are not supported",
215    "this version of PCRE is not compiled with PCRE_UTF8 support",    "this version of PCRE is not compiled with PCRE_UTF8 support",
216    "spare error",    "spare error",  /** DEAD **/
217    "character value in \\x{...} sequence is too large",    "character value in \\x{...} sequence is too large",
218    /* 35 */    /* 35 */
219    "invalid condition (?(0)",    "invalid condition (?(0)",
# Line 209  static const char *error_texts[] = { Line 224  static const char *error_texts[] = {
224    /* 40 */    /* 40 */
225    "recursive call could loop indefinitely",    "recursive call could loop indefinitely",
226    "unrecognized character after (?P",    "unrecognized character after (?P",
227    "syntax error after (?P",    "syntax error in subpattern name (missing terminator)",
228    "two named groups have the same name",    "two named subpatterns have the same name",
229    "invalid UTF-8 string",    "invalid UTF-8 string",
230    /* 45 */    /* 45 */
231    "support for \\P, \\p, and \\X has not been compiled",    "support for \\P, \\p, and \\X has not been compiled",
232    "malformed \\P or \\p sequence",    "malformed \\P or \\p sequence",
233    "unknown property name after \\P or \\p"    "unknown property name after \\P or \\p",
234      "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)",
235      "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")",
236      /* 50 */
237      "repeated subpattern is too long",
238      "octal value is greater than \\377 (not in UTF-8 mode)",
239      "internal error: overran compiling workspace",
240      "internal error: previously-checked referenced subpattern not found",
241      "DEFINE group contains more than one branch",
242      /* 55 */
243      "repeating a DEFINE group is not allowed",
244      "inconsistent NEWLINE options",
245      "\\g is not followed by an (optionally braced) non-zero number"
246  };  };
247    
248    
# Line 235  For convenience, we use the same bit def Line 262  For convenience, we use the same bit def
262    
263  Then we can use ctype_digit and ctype_xdigit in the code. */  Then we can use ctype_digit and ctype_xdigit in the code. */
264    
265  #if !EBCDIC    /* This is the "normal" case, for ASCII systems */  #ifndef EBCDIC  /* This is the "normal" case, for ASCII systems */
266  static const unsigned char digitab[] =  static const unsigned char digitab[] =
267    {    {
268    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
# Line 271  static const unsigned char digitab[] = Line 298  static const unsigned char digitab[] =
298    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
299    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
300    
301  #else          /* This is the "abnormal" case, for EBCDIC systems */  #else           /* This is the "abnormal" case, for EBCDIC systems */
302  static const unsigned char digitab[] =  static const unsigned char digitab[] =
303    {    {
304    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
# Line 285  static const unsigned char digitab[] = Line 312  static const unsigned char digitab[] =
312    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
313    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
314    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
315    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88-     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- 95    */
316    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
317    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
318    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
# Line 319  static const unsigned char ebcdic_charta Line 346  static const unsigned char ebcdic_charta
346    0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */    0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */
347    0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */    0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
348    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
349    0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88-  */    0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- 95 */
350    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
351    0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */    0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
352    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
# Line 346  static const unsigned char ebcdic_charta Line 373  static const unsigned char ebcdic_charta
373  /* Definition to allow mutual recursion */  /* Definition to allow mutual recursion */
374    
375  static BOOL  static BOOL
376    compile_regex(int, int, int *, uschar **, const uschar **, int *, BOOL, int,    compile_regex(int, int, uschar **, const uschar **, int *, BOOL, int, int *,
377      int *, int *, branch_chain *, compile_data *);      int *, branch_chain *, compile_data *, int *);
378    
379    
380    
# Line 357  static BOOL Line 384  static BOOL
384    
385  /* This function is called when a \ has been encountered. It either returns a  /* This function is called when a \ has been encountered. It either returns a
386  positive value for a simple escape such as \n, or a negative value which  positive value for a simple escape such as \n, or a negative value which
387  encodes one of the more complicated things such as \d. When UTF-8 is enabled,  encodes one of the more complicated things such as \d. A backreference to group
388  a positive value greater than 255 may be returned. On entry, ptr is pointing at  n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
389  the \. On exit, it is on the final character of the escape sequence.  UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
390    ptr is pointing at the \. On exit, it is on the final character of the escape
391    sequence.
392    
393  Arguments:  Arguments:
394    ptrptr         points to the pattern position pointer    ptrptr         points to the pattern position pointer
# Line 392  if (c == 0) *errorcodeptr = ERR1; Line 421  if (c == 0) *errorcodeptr = ERR1;
421  a table. A non-zero result is something that can be returned immediately.  a table. A non-zero result is something that can be returned immediately.
422  Otherwise further processing may be required. */  Otherwise further processing may be required. */
423    
424  #if !EBCDIC    /* ASCII coding */  #ifndef EBCDIC  /* ASCII coding */
425  else if (c < '0' || c > 'z') {}                           /* Not alphameric */  else if (c < '0' || c > 'z') {}                           /* Not alphameric */
426  else if ((i = escapes[c - '0']) != 0) c = i;  else if ((i = escapes[c - '0']) != 0) c = i;
427    
428  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
429  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphameric */  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphameric */
430  else if ((i = escapes[c - 0x48]) != 0)  c = i;  else if ((i = escapes[c - 0x48]) != 0)  c = i;
431  #endif  #endif
# Line 406  else if ((i = escapes[c - 0x48]) != 0) Line 435  else if ((i = escapes[c - 0x48]) != 0)
435  else  else
436    {    {
437    const uschar *oldptr;    const uschar *oldptr;
438      BOOL braced, negated;
439    
440    switch (c)    switch (c)
441      {      {
442      /* A number of Perl escapes are not handled by PCRE. We give an explicit      /* A number of Perl escapes are not handled by PCRE. We give an explicit
# Line 419  else Line 450  else
450      *errorcodeptr = ERR37;      *errorcodeptr = ERR37;
451      break;      break;
452    
453        /* \g must be followed by a number, either plain or braced. If positive, it
454        is an absolute backreference. If negative, it is a relative backreference.
455        This is a Perl 5.10 feature. */
456    
457        case 'g':
458        if (ptr[1] == '{')
459          {
460          braced = TRUE;
461          ptr++;
462          }
463        else braced = FALSE;
464    
465        if (ptr[1] == '-')
466          {
467          negated = TRUE;
468          ptr++;
469          }
470        else negated = FALSE;
471    
472        c = 0;
473        while ((digitab[ptr[1]] & ctype_digit) != 0)
474          c = c * 10 + *(++ptr) - '0';
475    
476        if (c == 0 || (braced && *(++ptr) != '}'))
477          {
478          *errorcodeptr = ERR57;
479          return 0;
480          }
481    
482        if (negated)
483          {
484          if (c > bracount)
485            {
486            *errorcodeptr = ERR15;
487            return 0;
488            }
489          c = bracount - (c - 1);
490          }
491    
492        c = -(ESC_REF + c);
493        break;
494    
495      /* The handling of escape sequences consisting of a string of digits      /* The handling of escape sequences consisting of a string of digits
496      starting with one that is not zero is not straightforward. By experiment,      starting with one that is not zero is not straightforward. By experiment,
497      the way Perl works seems to be as follows:      the way Perl works seems to be as follows:
# Line 460  else Line 533  else
533        }        }
534    
535      /* \0 always starts an octal number, but we may drop through to here with a      /* \0 always starts an octal number, but we may drop through to here with a
536      larger first octal digit. */      larger first octal digit. The original code used just to take the least
537        significant 8 bits of octal numbers (I think this is what early Perls used
538        to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
539        than 3 octal digits. */
540    
541      case '0':      case '0':
542      c -= '0';      c -= '0';
543      while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')      while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
544          c = c * 8 + *(++ptr) - '0';          c = c * 8 + *(++ptr) - '0';
545      c &= 255;     /* Take least significant 8 bits */      if (!utf8 && c > 255) *errorcodeptr = ERR51;
546      break;      break;
547    
548      /* \x is complicated. \x{ddd} is a character number which can be greater      /* \x is complicated. \x{ddd} is a character number which can be greater
# Line 486  else Line 562  else
562          if (c == 0 && cc == '0') continue;     /* Leading zeroes */          if (c == 0 && cc == '0') continue;     /* Leading zeroes */
563          count++;          count++;
564    
565  #if !EBCDIC    /* ASCII coding */  #ifndef EBCDIC  /* ASCII coding */
566          if (cc >= 'a') cc -= 32;               /* Convert to upper case */          if (cc >= 'a') cc -= 32;               /* Convert to upper case */
567          c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));          c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
568  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
569          if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */          if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */
570          c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));          c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
571  #endif  #endif
# Line 513  else Line 589  else
589        {        {
590        int cc;                               /* Some compilers don't like ++ */        int cc;                               /* Some compilers don't like ++ */
591        cc = *(++ptr);                        /* in initializers */        cc = *(++ptr);                        /* in initializers */
592  #if !EBCDIC    /* ASCII coding */  #ifndef EBCDIC  /* ASCII coding */
593        if (cc >= 'a') cc -= 32;              /* Convert to upper case */        if (cc >= 'a') cc -= 32;              /* Convert to upper case */
594        c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));        c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
595  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
596        if (cc <= 'z') cc += 64;              /* Convert to upper case */        if (cc <= 'z') cc += 64;              /* Convert to upper case */
597        c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));        c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
598  #endif  #endif
599        }        }
600      break;      break;
601    
602      /* Other special escapes not starting with a digit are straightforward */      /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
603        This coding is ASCII-specific, but then the whole concept of \cx is
604        ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
605    
606      case 'c':      case 'c':
607      c = *(++ptr);      c = *(++ptr);
# Line 533  else Line 611  else
611        return 0;        return 0;
612        }        }
613    
614      /* A letter is upper-cased; then the 0x40 bit is flipped. This coding  #ifndef EBCDIC  /* ASCII coding */
     is ASCII-specific, but then the whole concept of \cx is ASCII-specific.  
     (However, an EBCDIC equivalent has now been added.) */  
   
 #if !EBCDIC    /* ASCII coding */  
615      if (c >= 'a' && c <= 'z') c -= 32;      if (c >= 'a' && c <= 'z') c -= 32;
616      c ^= 0x40;      c ^= 0x40;
617  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
618      if (c >= 'a' && c <= 'z') c += 64;      if (c >= 'a' && c <= 'z') c += 64;
619      c ^= 0xC0;      c ^= 0xC0;
620  #endif  #endif
# Line 763  return p; Line 837  return p;
837    
838    
839  /*************************************************  /*************************************************
840    *       Find forward referenced subpattern       *
841    *************************************************/
842    
843    /* This function scans along a pattern's text looking for capturing
844    subpatterns, and counting them. If it finds a named pattern that matches the
845    name it is given, it returns its number. Alternatively, if the name is NULL, it
846    returns when it reaches a given numbered subpattern. This is used for forward
847    references to subpatterns. We know that if (?P< is encountered, the name will
848    be terminated by '>' because that is checked in the first pass.
849    
850    Arguments:
851      ptr          current position in the pattern
852      count        current count of capturing parens so far encountered
853      name         name to seek, or NULL if seeking a numbered subpattern
854      lorn         name length, or subpattern number if name is NULL
855      xmode        TRUE if we are in /x mode
856    
857    Returns:       the number of the named subpattern, or -1 if not found
858    */
859    
860    static int
861    find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
862      BOOL xmode)
863    {
864    const uschar *thisname;
865    
866    for (; *ptr != 0; ptr++)
867      {
868      int term;
869    
870      /* Skip over backslashed characters and also entire \Q...\E */
871    
872      if (*ptr == '\\')
873        {
874        if (*(++ptr) == 0) return -1;
875        if (*ptr == 'Q') for (;;)
876          {
877          while (*(++ptr) != 0 && *ptr != '\\');
878          if (*ptr == 0) return -1;
879          if (*(++ptr) == 'E') break;
880          }
881        continue;
882        }
883    
884      /* Skip over character classes */
885    
886      if (*ptr == '[')
887        {
888        while (*(++ptr) != ']')
889          {
890          if (*ptr == '\\')
891            {
892            if (*(++ptr) == 0) return -1;
893            if (*ptr == 'Q') for (;;)
894              {
895              while (*(++ptr) != 0 && *ptr != '\\');
896              if (*ptr == 0) return -1;
897              if (*(++ptr) == 'E') break;
898              }
899            continue;
900            }
901          }
902        continue;
903        }
904    
905      /* Skip comments in /x mode */
906    
907      if (xmode && *ptr == '#')
908        {
909        while (*(++ptr) != 0 && *ptr != '\n');
910        if (*ptr == 0) return -1;
911        continue;
912        }
913    
914      /* An opening parens must now be a real metacharacter */
915    
916      if (*ptr != '(') continue;
917      if (ptr[1] != '?')
918        {
919        count++;
920        if (name == NULL && count == lorn) return count;
921        continue;
922        }
923    
924      ptr += 2;
925      if (*ptr == 'P') ptr++;                      /* Allow optional P */
926    
927      /* We have to disambiguate (?<! and (?<= from (?<name> */
928    
929      if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
930           *ptr != '\'')
931        continue;
932    
933      count++;
934    
935      if (name == NULL && count == lorn) return count;
936      term = *ptr++;
937      if (term == '<') term = '>';
938      thisname = ptr;
939      while (*ptr != term) ptr++;
940      if (name != NULL && lorn == ptr - thisname &&
941          strncmp((const char *)name, (const char *)thisname, lorn) == 0)
942        return count;
943      }
944    
945    return -1;
946    }
947    
948    
949    
950    /*************************************************
951  *      Find first significant op code            *  *      Find first significant op code            *
952  *************************************************/  *************************************************/
953    
# Line 811  for (;;) Line 996  for (;;)
996    
997      case OP_CALLOUT:      case OP_CALLOUT:
998      case OP_CREF:      case OP_CREF:
999      case OP_BRANUMBER:      case OP_RREF:
1000        case OP_DEF:
1001      code += _pcre_OP_lengths[*code];      code += _pcre_OP_lengths[*code];
1002      break;      break;
1003    
# Line 856  for (;;) Line 1042  for (;;)
1042    {    {
1043    int d;    int d;
1044    register int op = *cc;    register int op = *cc;
   if (op >= OP_BRA) op = OP_BRA;  
1045    
1046    switch (op)    switch (op)
1047      {      {
1048        case OP_CBRA:
1049      case OP_BRA:      case OP_BRA:
1050      case OP_ONCE:      case OP_ONCE:
1051      case OP_COND:      case OP_COND:
1052      d = find_fixedlength(cc, options);      d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1053      if (d < 0) return d;      if (d < 0) return d;
1054      branchlength += d;      branchlength += d;
1055      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
# Line 898  for (;;) Line 1084  for (;;)
1084      /* Skip over things that don't match chars */      /* Skip over things that don't match chars */
1085    
1086      case OP_REVERSE:      case OP_REVERSE:
     case OP_BRANUMBER:  
1087      case OP_CREF:      case OP_CREF:
1088        case OP_RREF:
1089        case OP_DEF:
1090      case OP_OPT:      case OP_OPT:
1091      case OP_CALLOUT:      case OP_CALLOUT:
1092      case OP_SOD:      case OP_SOD:
# Line 917  for (;;) Line 1104  for (;;)
1104    
1105      case OP_CHAR:      case OP_CHAR:
1106      case OP_CHARNC:      case OP_CHARNC:
1107        case OP_NOT:
1108      branchlength++;      branchlength++;
1109      cc += 2;      cc += 2;
1110  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
# Line 1031  Returns: pointer to the opcode for Line 1219  Returns: pointer to the opcode for
1219  static const uschar *  static const uschar *
1220  find_bracket(const uschar *code, BOOL utf8, int number)  find_bracket(const uschar *code, BOOL utf8, int number)
1221  {  {
 #ifndef SUPPORT_UTF8  
 utf8 = utf8;               /* Stop pedantic compilers complaining */  
 #endif  
   
1222  for (;;)  for (;;)
1223    {    {
1224    register int c = *code;    register int c = *code;
1225    if (c == OP_END) return NULL;    if (c == OP_END) return NULL;
1226    else if (c > OP_BRA)  
1227      /* XCLASS is used for classes that cannot be represented just by a bit
1228      map. This includes negated single high-valued characters. The length in
1229      the table is zero; the actual length is stored in the compiled code. */
1230    
1231      if (c == OP_XCLASS) code += GET(code, 1);
1232    
1233      /* Handle capturing bracket */
1234    
1235      else if (c == OP_CBRA)
1236      {      {
1237      int n = c - OP_BRA;      int n = GET2(code, 1+LINK_SIZE);
     if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE);  
1238      if (n == number) return (uschar *)code;      if (n == number) return (uschar *)code;
1239      code += _pcre_OP_lengths[OP_BRA];      code += _pcre_OP_lengths[c];
1240      }      }
1241    
1242      /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1243      a multi-byte character. The length in the table is a minimum, so we have to
1244      arrange to skip the extra bytes. */
1245    
1246    else    else
1247      {      {
1248      code += _pcre_OP_lengths[c];      code += _pcre_OP_lengths[c];
   
1249  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
   
     /* In UTF-8 mode, opcodes that are followed by a character may be followed  
     by a multi-byte character. The length in the table is a minimum, so we have  
     to scan along to skip the extra bytes. All opcodes are less than 128, so we  
     can use relatively efficient code. */  
   
1250      if (utf8) switch(c)      if (utf8) switch(c)
1251        {        {
1252        case OP_CHAR:        case OP_CHAR:
# Line 1064  for (;;) Line 1254  for (;;)
1254        case OP_EXACT:        case OP_EXACT:
1255        case OP_UPTO:        case OP_UPTO:
1256        case OP_MINUPTO:        case OP_MINUPTO:
1257          case OP_POSUPTO:
1258        case OP_STAR:        case OP_STAR:
1259        case OP_MINSTAR:        case OP_MINSTAR:
1260          case OP_POSSTAR:
1261        case OP_PLUS:        case OP_PLUS:
1262        case OP_MINPLUS:        case OP_MINPLUS:
1263          case OP_POSPLUS:
1264        case OP_QUERY:        case OP_QUERY:
1265        case OP_MINQUERY:        case OP_MINQUERY:
1266        while ((*code & 0xc0) == 0x80) code++;        case OP_POSQUERY:
1267        break;        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
   
       /* XCLASS is used for classes that cannot be represented just by a bit  
       map. This includes negated single high-valued characters. The length in  
       the table is zero; the actual length is stored in the compiled code. */  
   
       case OP_XCLASS:  
       code += GET(code, 1) + 1;  
1268        break;        break;
1269        }        }
1270  #endif  #endif
1271      }      }
1272    }    }
1273  }  }
# Line 1105  Returns: pointer to the opcode for Line 1291  Returns: pointer to the opcode for
1291  static const uschar *  static const uschar *
1292  find_recurse(const uschar *code, BOOL utf8)  find_recurse(const uschar *code, BOOL utf8)
1293  {  {
 #ifndef SUPPORT_UTF8  
 utf8 = utf8;               /* Stop pedantic compilers complaining */  
 #endif  
   
1294  for (;;)  for (;;)
1295    {    {
1296    register int c = *code;    register int c = *code;
1297    if (c == OP_END) return NULL;    if (c == OP_END) return NULL;
1298    else if (c == OP_RECURSE) return code;    if (c == OP_RECURSE) return code;
1299    else if (c > OP_BRA)  
1300      {    /* XCLASS is used for classes that cannot be represented just by a bit
1301      code += _pcre_OP_lengths[OP_BRA];    map. This includes negated single high-valued characters. The length in
1302      }    the table is zero; the actual length is stored in the compiled code. */
1303    
1304      if (c == OP_XCLASS) code += GET(code, 1);
1305    
1306      /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes
1307      that are followed by a character may be followed by a multi-byte character.
1308      The length in the table is a minimum, so we have to arrange to skip the extra
1309      bytes. */
1310    
1311    else    else
1312      {      {
1313      code += _pcre_OP_lengths[c];      code += _pcre_OP_lengths[c];
   
1314  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
   
     /* In UTF-8 mode, opcodes that are followed by a character may be followed  
     by a multi-byte character. The length in the table is a minimum, so we have  
     to scan along to skip the extra bytes. All opcodes are less than 128, so we  
     can use relatively efficient code. */  
   
1315      if (utf8) switch(c)      if (utf8) switch(c)
1316        {        {
1317        case OP_CHAR:        case OP_CHAR:
# Line 1136  for (;;) Line 1319  for (;;)
1319        case OP_EXACT:        case OP_EXACT:
1320        case OP_UPTO:        case OP_UPTO:
1321        case OP_MINUPTO:        case OP_MINUPTO:
1322          case OP_POSUPTO:
1323        case OP_STAR:        case OP_STAR:
1324        case OP_MINSTAR:        case OP_MINSTAR:
1325          case OP_POSSTAR:
1326        case OP_PLUS:        case OP_PLUS:
1327        case OP_MINPLUS:        case OP_MINPLUS:
1328          case OP_POSPLUS:
1329        case OP_QUERY:        case OP_QUERY:
1330        case OP_MINQUERY:        case OP_MINQUERY:
1331        while ((*code & 0xc0) == 0x80) code++;        case OP_POSQUERY:
1332        break;        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
   
       /* XCLASS is used for classes that cannot be represented just by a bit  
       map. This includes negated single high-valued characters. The length in  
       the table is zero; the actual length is stored in the compiled code. */  
   
       case OP_XCLASS:  
       code += GET(code, 1) + 1;  
1333        break;        break;
1334        }        }
1335  #endif  #endif
1336      }      }
1337    }    }
1338  }  }
# Line 1165  for (;;) Line 1344  for (;;)
1344  *************************************************/  *************************************************/
1345    
1346  /* This function scans through a branch of a compiled pattern to see whether it  /* This function scans through a branch of a compiled pattern to see whether it
1347  can match the empty string or not. It is called only from could_be_empty()  can match the empty string or not. It is called from could_be_empty()
1348  below. Note that first_significant_code() skips over assertions. If we hit an  below and from compile_branch() when checking for an unlimited repeat of a
1349  unclosed bracket, we return "empty" - this means we've struck an inner bracket  group that can match nothing. Note that first_significant_code() skips over
1350  whose current branch will already have been scanned.  assertions. If we hit an unclosed bracket, we return "empty" - this means we've
1351    struck an inner bracket whose current branch will already have been scanned.
1352    
1353  Arguments:  Arguments:
1354    code        points to start of search    code        points to start of search
# Line 1182  static BOOL Line 1362  static BOOL
1362  could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)  could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1363  {  {
1364  register int c;  register int c;
1365  for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0, TRUE);  for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1366       code < endcode;       code < endcode;
1367       code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))       code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1368    {    {
# Line 1190  for (code = first_significant_code(code Line 1370  for (code = first_significant_code(code
1370    
1371    c = *code;    c = *code;
1372    
1373    if (c >= OP_BRA)    if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE)
1374      {      {
1375      BOOL empty_branch;      BOOL empty_branch;
1376      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
# Line 1206  for (code = first_significant_code(code Line 1386  for (code = first_significant_code(code
1386        }        }
1387      while (*code == OP_ALT);      while (*code == OP_ALT);
1388      if (!empty_branch) return FALSE;   /* All branches are non-empty */      if (!empty_branch) return FALSE;   /* All branches are non-empty */
1389      code += 1 + LINK_SIZE;  
1390      c = *code;      /* Move past the KET and fudge things so that the increment in the "for"
1391        above has no effect. */
1392    
1393        c = OP_END;
1394        code += 1 + LINK_SIZE - _pcre_OP_lengths[c];
1395        continue;
1396      }      }
1397    
1398    else switch (c)    /* Handle the other opcodes */
1399    
1400      switch (c)
1401      {      {
1402      /* Check for quantifiers after a class */      /* Check for quantifiers after a class */
1403    
# Line 1266  for (code = first_significant_code(code Line 1453  for (code = first_significant_code(code
1453      case OP_NOT:      case OP_NOT:
1454      case OP_PLUS:      case OP_PLUS:
1455      case OP_MINPLUS:      case OP_MINPLUS:
1456        case OP_POSPLUS:
1457      case OP_EXACT:      case OP_EXACT:
1458      case OP_NOTPLUS:      case OP_NOTPLUS:
1459      case OP_NOTMINPLUS:      case OP_NOTMINPLUS:
1460        case OP_NOTPOSPLUS:
1461      case OP_NOTEXACT:      case OP_NOTEXACT:
1462      case OP_TYPEPLUS:      case OP_TYPEPLUS:
1463      case OP_TYPEMINPLUS:      case OP_TYPEMINPLUS:
1464        case OP_TYPEPOSPLUS:
1465      case OP_TYPEEXACT:      case OP_TYPEEXACT:
1466      return FALSE;      return FALSE;
1467    
# Line 1283  for (code = first_significant_code(code Line 1473  for (code = first_significant_code(code
1473      case OP_ALT:      case OP_ALT:
1474      return TRUE;      return TRUE;
1475    
1476      /* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO  may be      /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1477      followed by a multibyte character */      MINUPTO, and POSUPTO may be followed by a multibyte character */
1478    
1479  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1480      case OP_STAR:      case OP_STAR:
1481      case OP_MINSTAR:      case OP_MINSTAR:
1482        case OP_POSSTAR:
1483      case OP_QUERY:      case OP_QUERY:
1484      case OP_MINQUERY:      case OP_MINQUERY:
1485        case OP_POSQUERY:
1486      case OP_UPTO:      case OP_UPTO:
1487      case OP_MINUPTO:      case OP_MINUPTO:
1488        case OP_POSUPTO:
1489      if (utf8) while ((code[2] & 0xc0) == 0x80) code++;      if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1490      break;      break;
1491  #endif  #endif
# Line 1410  earlier groups that are outside the curr Line 1603  earlier groups that are outside the curr
1603  optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before  optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1604  it, after it has been compiled. This means that any OP_RECURSE items within it  it, after it has been compiled. This means that any OP_RECURSE items within it
1605  that refer to the group itself or any contained groups have to have their  that refer to the group itself or any contained groups have to have their
1606  offsets adjusted. That is the job of this function. Before it is called, the  offsets adjusted. That is one of the jobs of this function. Before it is called,
1607  partially compiled regex must be temporarily terminated with OP_END.  the partially compiled regex must be temporarily terminated with OP_END.
1608    
1609    This function has been extended with the possibility of forward references for
1610    recursions and subroutine calls. It must also check the list of such references
1611    for the group we are dealing with. If it finds that one of the recursions in
1612    the current group is on this list, it adjusts the offset in the list, not the
1613    value in the reference (which is a group number).
1614    
1615  Arguments:  Arguments:
1616    group      points to the start of the group    group      points to the start of the group
1617    adjust     the amount by which the group is to be moved    adjust     the amount by which the group is to be moved
1618    utf8       TRUE in UTF-8 mode    utf8       TRUE in UTF-8 mode
1619    cd         contains pointers to tables etc.    cd         contains pointers to tables etc.
1620      save_hwm   the hwm forward reference pointer at the start of the group
1621    
1622  Returns:     nothing  Returns:     nothing
1623  */  */
1624    
1625  static void  static void
1626  adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd)  adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1627      uschar *save_hwm)
1628  {  {
1629  uschar *ptr = group;  uschar *ptr = group;
1630  while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)  while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1631    {    {
1632    int offset = GET(ptr, 1);    int offset;
1633    if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);    uschar *hc;
1634    
1635      /* See if this recursion is on the forward reference list. If so, adjust the
1636      reference. */
1637    
1638      for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1639        {
1640        offset = GET(hc, 0);
1641        if (cd->start_code + offset == ptr + 1)
1642          {
1643          PUT(hc, 0, offset + adjust);
1644          break;
1645          }
1646        }
1647    
1648      /* Otherwise, adjust the recursion offset if it's after the start of this
1649      group. */
1650    
1651      if (hc >= cd->hwm)
1652        {
1653        offset = GET(ptr, 1);
1654        if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1655        }
1656    
1657    ptr += 1 + LINK_SIZE;    ptr += 1 + LINK_SIZE;
1658    }    }
1659  }  }
# Line 1508  Yield: TRUE when range returned; Line 1732  Yield: TRUE when range returned;
1732  */  */
1733    
1734  static BOOL  static BOOL
1735  get_othercase_range(int *cptr, int d, int *ocptr, int *odptr)  get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
1736      unsigned int *odptr)
1737  {  {
1738  int c, othercase, next;  unsigned int c, othercase, next;
1739    
1740  for (c = *cptr; c <= d; c++)  for (c = *cptr; c <= d; c++)
1741    { if ((othercase = _pcre_ucp_othercase(c)) >= 0) break; }    { if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) break; }
1742    
1743  if (c > d) return FALSE;  if (c > d) return FALSE;
1744    
# Line 1534  return TRUE; Line 1759  return TRUE;
1759  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
1760    
1761    
1762    
1763    /*************************************************
1764    *     Check if auto-possessifying is possible    *
1765    *************************************************/
1766    
1767    /* This function is called for unlimited repeats of certain items, to see
1768    whether the next thing could possibly match the repeated item. If not, it makes
1769    sense to automatically possessify the repeated item.
1770    
1771    Arguments:
1772      op_code       the repeated op code
1773      item          data for this item, depends on the opcode
1774      utf8          TRUE in UTF-8 mode
1775      utf8_char     used for utf8 character bytes, NULL if not relevant
1776      ptr           next character in pattern
1777      options       options bits
1778      cd            contains pointers to tables etc.
1779    
1780    Returns:        TRUE if possessifying is wanted
1781    */
1782    
1783    static BOOL
1784    check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
1785      const uschar *ptr, int options, compile_data *cd)
1786    {
1787    int next;
1788    
1789    /* Skip whitespace and comments in extended mode */
1790    
1791    if ((options & PCRE_EXTENDED) != 0)
1792      {
1793      for (;;)
1794        {
1795        while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1796        if (*ptr == '#')
1797          {
1798          while (*(++ptr) != 0)
1799            if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1800          }
1801        else break;
1802        }
1803      }
1804    
1805    /* If the next item is one that we can handle, get its value. A non-negative
1806    value is a character, a negative value is an escape value. */
1807    
1808    if (*ptr == '\\')
1809      {
1810      int temperrorcode = 0;
1811      next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
1812      if (temperrorcode != 0) return FALSE;
1813      ptr++;    /* Point after the escape sequence */
1814      }
1815    
1816    else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
1817      {
1818    #ifdef SUPPORT_UTF8
1819      if (utf8) { GETCHARINC(next, ptr); } else
1820    #endif
1821      next = *ptr++;
1822      }
1823    
1824    else return FALSE;
1825    
1826    /* Skip whitespace and comments in extended mode */
1827    
1828    if ((options & PCRE_EXTENDED) != 0)
1829      {
1830      for (;;)
1831        {
1832        while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1833        if (*ptr == '#')
1834          {
1835          while (*(++ptr) != 0)
1836            if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1837          }
1838        else break;
1839        }
1840      }
1841    
1842    /* If the next thing is itself optional, we have to give up. */
1843    
1844    if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
1845      return FALSE;
1846    
1847    /* Now compare the next item with the previous opcode. If the previous is a
1848    positive single character match, "item" either contains the character or, if
1849    "item" is greater than 127 in utf8 mode, the character's bytes are in
1850    utf8_char. */
1851    
1852    
1853    /* Handle cases when the next item is a character. */
1854    
1855    if (next >= 0) switch(op_code)
1856      {
1857      case OP_CHAR:
1858    #ifdef SUPPORT_UTF8
1859      if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1860    #endif
1861      return item != next;
1862    
1863      /* For CHARNC (caseless character) we must check the other case. If we have
1864      Unicode property support, we can use it to test the other case of
1865      high-valued characters. */
1866    
1867      case OP_CHARNC:
1868    #ifdef SUPPORT_UTF8
1869      if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1870    #endif
1871      if (item == next) return FALSE;
1872    #ifdef SUPPORT_UTF8
1873      if (utf8)
1874        {
1875        unsigned int othercase;
1876        if (next < 128) othercase = cd->fcc[next]; else
1877    #ifdef SUPPORT_UCP
1878        othercase = _pcre_ucp_othercase((unsigned int)next);
1879    #else
1880        othercase = NOTACHAR;
1881    #endif
1882        return (unsigned int)item != othercase;
1883        }
1884      else
1885    #endif  /* SUPPORT_UTF8 */
1886      return (item != cd->fcc[next]);  /* Non-UTF-8 mode */
1887    
1888      /* For OP_NOT, "item" must be a single-byte character. */
1889    
1890      case OP_NOT:
1891      if (next < 0) return FALSE;  /* Not a character */
1892      if (item == next) return TRUE;
1893      if ((options & PCRE_CASELESS) == 0) return FALSE;
1894    #ifdef SUPPORT_UTF8
1895      if (utf8)
1896        {
1897        unsigned int othercase;
1898        if (next < 128) othercase = cd->fcc[next]; else
1899    #ifdef SUPPORT_UCP
1900        othercase = _pcre_ucp_othercase(next);
1901    #else
1902        othercase = NOTACHAR;
1903    #endif
1904        return (unsigned int)item == othercase;
1905        }
1906      else
1907    #endif  /* SUPPORT_UTF8 */
1908      return (item == cd->fcc[next]);  /* Non-UTF-8 mode */
1909    
1910      case OP_DIGIT:
1911      return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
1912    
1913      case OP_NOT_DIGIT:
1914      return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
1915    
1916      case OP_WHITESPACE:
1917      return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
1918    
1919      case OP_NOT_WHITESPACE:
1920      return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
1921    
1922      case OP_WORDCHAR:
1923      return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
1924    
1925      case OP_NOT_WORDCHAR:
1926      return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
1927    
1928      default:
1929      return FALSE;
1930      }
1931    
1932    
1933    /* Handle the case when the next item is \d, \s, etc. */
1934    
1935    switch(op_code)
1936      {
1937      case OP_CHAR:
1938      case OP_CHARNC:
1939    #ifdef SUPPORT_UTF8
1940      if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1941    #endif
1942      switch(-next)
1943        {
1944        case ESC_d:
1945        return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
1946    
1947        case ESC_D:
1948        return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
1949    
1950        case ESC_s:
1951        return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
1952    
1953        case ESC_S:
1954        return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
1955    
1956        case ESC_w:
1957        return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
1958    
1959        case ESC_W:
1960        return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
1961    
1962        default:
1963        return FALSE;
1964        }
1965    
1966      case OP_DIGIT:
1967      return next == -ESC_D || next == -ESC_s || next == -ESC_W;
1968    
1969      case OP_NOT_DIGIT:
1970      return next == -ESC_d;
1971    
1972      case OP_WHITESPACE:
1973      return next == -ESC_S || next == -ESC_d || next == -ESC_w;
1974    
1975      case OP_NOT_WHITESPACE:
1976      return next == -ESC_s;
1977    
1978      case OP_WORDCHAR:
1979      return next == -ESC_W || next == -ESC_s;
1980    
1981      case OP_NOT_WORDCHAR:
1982      return next == -ESC_w || next == -ESC_d;
1983    
1984      default:
1985      return FALSE;
1986      }
1987    
1988    /* Control does not reach here */
1989    }
1990    
1991    
1992    
1993  /*************************************************  /*************************************************
1994  *           Compile one branch                   *  *           Compile one branch                   *
1995  *************************************************/  *************************************************/
1996    
1997  /* Scan the pattern, compiling it into the code vector. If the options are  /* Scan the pattern, compiling it into a vector. If the options are
1998  changed during the branch, the pointer is used to change the external options  changed during the branch, the pointer is used to change the external options
1999  bits.  bits. This function is used during the pre-compile phase when we are trying
2000    to find out the amount of memory needed, as well as during the real compile
2001    phase. The value of lengthptr distinguishes the two phases.
2002    
2003  Arguments:  Arguments:
2004    optionsptr     pointer to the option bits    optionsptr     pointer to the option bits
   brackets       points to number of extracting brackets used  
2005    codeptr        points to the pointer to the current code point    codeptr        points to the pointer to the current code point
2006    ptrptr         points to the current pattern pointer    ptrptr         points to the current pattern pointer
2007    errorcodeptr   points to error code variable    errorcodeptr   points to error code variable
# Line 1552  Arguments: Line 2009  Arguments:
2009    reqbyteptr     set to the last literal character required, else < 0    reqbyteptr     set to the last literal character required, else < 0
2010    bcptr          points to current branch chain    bcptr          points to current branch chain
2011    cd             contains pointers to tables etc.    cd             contains pointers to tables etc.
2012      lengthptr      NULL during the real compile phase
2013                     points to length accumulator during pre-compile phase
2014    
2015  Returns:         TRUE on success  Returns:         TRUE on success
2016                   FALSE, with *errorcodeptr set non-zero on error                   FALSE, with *errorcodeptr set non-zero on error
2017  */  */
2018    
2019  static BOOL  static BOOL
2020  compile_branch(int *optionsptr, int *brackets, uschar **codeptr,  compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2021    const uschar **ptrptr, int *errorcodeptr, int *firstbyteptr,    int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2022    int *reqbyteptr, branch_chain *bcptr, compile_data *cd)    compile_data *cd, int *lengthptr)
2023  {  {
2024  int repeat_type, op_type;  int repeat_type, op_type;
2025  int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */  int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
# Line 1569  int greedy_default, greedy_non_default; Line 2028  int greedy_default, greedy_non_default;
2028  int firstbyte, reqbyte;  int firstbyte, reqbyte;
2029  int zeroreqbyte, zerofirstbyte;  int zeroreqbyte, zerofirstbyte;
2030  int req_caseopt, reqvary, tempreqvary;  int req_caseopt, reqvary, tempreqvary;
 int condcount = 0;  
2031  int options = *optionsptr;  int options = *optionsptr;
2032  int after_manual_callout = 0;  int after_manual_callout = 0;
2033    int length_prevgroup = 0;
2034  register int c;  register int c;
2035  register uschar *code = *codeptr;  register uschar *code = *codeptr;
2036    uschar *last_code = code;
2037    uschar *orig_code = code;
2038  uschar *tempcode;  uschar *tempcode;
2039  BOOL inescq = FALSE;  BOOL inescq = FALSE;
2040  BOOL groupsetfirstbyte = FALSE;  BOOL groupsetfirstbyte = FALSE;
# Line 1581  const uschar *ptr = *ptrptr; Line 2042  const uschar *ptr = *ptrptr;
2042  const uschar *tempptr;  const uschar *tempptr;
2043  uschar *previous = NULL;  uschar *previous = NULL;
2044  uschar *previous_callout = NULL;  uschar *previous_callout = NULL;
2045    uschar *save_hwm = NULL;
2046  uschar classbits[32];  uschar classbits[32];
2047    
2048  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
# Line 1590  uschar *class_utf8data; Line 2052  uschar *class_utf8data;
2052  uschar utf8_char[6];  uschar utf8_char[6];
2053  #else  #else
2054  BOOL utf8 = FALSE;  BOOL utf8 = FALSE;
2055    uschar *utf8_char = NULL;
2056    #endif
2057    
2058    #ifdef DEBUG
2059    if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2060  #endif  #endif
2061    
2062  /* Set up the default and non-default settings for greediness */  /* Set up the default and non-default settings for greediness */
# Line 1623  for (;; ptr++) Line 2090  for (;; ptr++)
2090    BOOL negate_class;    BOOL negate_class;
2091    BOOL possessive_quantifier;    BOOL possessive_quantifier;
2092    BOOL is_quantifier;    BOOL is_quantifier;
2093      BOOL is_recurse;
2094    int class_charcount;    int class_charcount;
2095    int class_lastchar;    int class_lastchar;
2096    int newoptions;    int newoptions;
# Line 1630  for (;; ptr++) Line 2098  for (;; ptr++)
2098    int skipbytes;    int skipbytes;
2099    int subreqbyte;    int subreqbyte;
2100    int subfirstbyte;    int subfirstbyte;
2101      int terminator;
2102    int mclength;    int mclength;
2103    uschar mcbuffer[8];    uschar mcbuffer[8];
2104    
2105    /* Next byte in the pattern */    /* Get next byte in the pattern */
2106    
2107    c = *ptr;    c = *ptr;
2108    
2109    /* If in \Q...\E, check for the end; if not, we have a literal */    /* If we are in the pre-compile phase, accumulate the length used for the
2110      previous cycle of this loop. */
2111    
2112    if (inescq && c != 0)    if (lengthptr != NULL)
2113      {      {
2114      if (c == '\\' && ptr[1] == 'E')  #ifdef DEBUG
2115        if (code > cd->hwm) cd->hwm = code;                 /* High water info */
2116    #endif
2117        if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2118        {        {
2119        inescq = FALSE;        *errorcodeptr = ERR52;
2120        ptr++;        goto FAILED;
       continue;  
2121        }        }
2122      else  
2123        /* There is at least one situation where code goes backwards: this is the
2124        case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2125        the class is simply eliminated. However, it is created first, so we have to
2126        allow memory for it. Therefore, don't ever reduce the length at this point.
2127        */
2128    
2129        if (code < last_code) code = last_code;
2130        *lengthptr += code - last_code;
2131        DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2132    
2133        /* If "previous" is set and it is not at the start of the work space, move
2134        it back to there, in order to avoid filling up the work space. Otherwise,
2135        if "previous" is NULL, reset the current code pointer to the start. */
2136    
2137        if (previous != NULL)
2138          {
2139          if (previous > orig_code)
2140            {
2141            memmove(orig_code, previous, code - previous);
2142            code -= previous - orig_code;
2143            previous = orig_code;
2144            }
2145          }
2146        else code = orig_code;
2147    
2148        /* Remember where this code item starts so we can pick up the length
2149        next time round. */
2150    
2151        last_code = code;
2152        }
2153    
2154      /* In the real compile phase, just check the workspace used by the forward
2155      reference list. */
2156    
2157      else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2158        {
2159        *errorcodeptr = ERR52;
2160        goto FAILED;
2161        }
2162    
2163      /* If in \Q...\E, check for the end; if not, we have a literal */
2164    
2165      if (inescq && c != 0)
2166        {
2167        if (c == '\\' && ptr[1] == 'E')
2168          {
2169          inescq = FALSE;
2170          ptr++;
2171          continue;
2172          }
2173        else
2174        {        {
2175        if (previous_callout != NULL)        if (previous_callout != NULL)
2176          {          {
2177          complete_callout(previous_callout, ptr, cd);          if (lengthptr == NULL)  /* Don't attempt in pre-compile phase */
2178              complete_callout(previous_callout, ptr, cd);
2179          previous_callout = NULL;          previous_callout = NULL;
2180          }          }
2181        if ((options & PCRE_AUTO_CALLOUT) != 0)        if ((options & PCRE_AUTO_CALLOUT) != 0)
# Line 1672  for (;; ptr++) Line 2196  for (;; ptr++)
2196    if (!is_quantifier && previous_callout != NULL &&    if (!is_quantifier && previous_callout != NULL &&
2197         after_manual_callout-- <= 0)         after_manual_callout-- <= 0)
2198      {      {
2199      complete_callout(previous_callout, ptr, cd);      if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
2200          complete_callout(previous_callout, ptr, cd);
2201      previous_callout = NULL;      previous_callout = NULL;
2202      }      }
2203    
# Line 1683  for (;; ptr++) Line 2208  for (;; ptr++)
2208      if ((cd->ctypes[c] & ctype_space) != 0) continue;      if ((cd->ctypes[c] & ctype_space) != 0) continue;
2209      if (c == '#')      if (c == '#')
2210        {        {
2211        /* The space before the ; is to avoid a warning on a silly compiler        while (*(++ptr) != 0)
2212        on the Macintosh. */          {
2213        while ((c = *(++ptr)) != 0 && c != NEWLINE) ;          if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2214        if (c != 0) continue;   /* Else fall through to handle end of string */          }
2215          if (*ptr != 0) continue;
2216    
2217          /* Else fall through to handle end of string */
2218          c = 0;
2219        }        }
2220      }      }
2221    
# Line 1700  for (;; ptr++) Line 2229  for (;; ptr++)
2229    
2230    switch(c)    switch(c)
2231      {      {
2232      /* The branch terminates at end of string, |, or ). */      /* ===================================================================*/
2233        case 0:                        /* The branch terminates at string end */
2234      case 0:      case '|':                      /* or | or ) */
     case '|':  
2235      case ')':      case ')':
2236      *firstbyteptr = firstbyte;      *firstbyteptr = firstbyte;
2237      *reqbyteptr = reqbyte;      *reqbyteptr = reqbyte;
2238      *codeptr = code;      *codeptr = code;
2239      *ptrptr = ptr;      *ptrptr = ptr;
2240        if (lengthptr != NULL)
2241          {
2242          *lengthptr += code - last_code;   /* To include callout length */
2243          DPRINTF((">> end branch\n"));
2244          }
2245      return TRUE;      return TRUE;
2246    
2247    
2248        /* ===================================================================*/
2249      /* Handle single-character metacharacters. In multiline mode, ^ disables      /* Handle single-character metacharacters. In multiline mode, ^ disables
2250      the setting of any following char as a first character. */      the setting of any following char as a first character. */
2251    
# Line 1739  for (;; ptr++) Line 2274  for (;; ptr++)
2274      *code++ = OP_ANY;      *code++ = OP_ANY;
2275      break;      break;
2276    
2277    
2278        /* ===================================================================*/
2279      /* Character classes. If the included characters are all < 256, we build a      /* Character classes. If the included characters are all < 256, we build a
2280      32-byte bitmap of the permitted characters, except in the special case      32-byte bitmap of the permitted characters, except in the special case
2281      where there is only one such character. For negated classes, we build the      where there is only one such character. For negated classes, we build the
# Line 1777  for (;; ptr++) Line 2314  for (;; ptr++)
2314        }        }
2315    
2316      /* Keep a count of chars with values < 256 so that we can optimize the case      /* Keep a count of chars with values < 256 so that we can optimize the case
2317      of just a single character (as long as it's < 256). For higher valued UTF-8      of just a single character (as long as it's < 256). However, for higher
2318      characters, we don't yet do any optimization. */      valued UTF-8 characters, we don't yet do any optimization. */
2319    
2320      class_charcount = 0;      class_charcount = 0;
2321      class_lastchar = -1;      class_lastchar = -1;
2322    
2323        /* Initialize the 32-char bit map to all zeros. We build the map in a
2324        temporary bit of memory, in case the class contains only 1 character (less
2325        than 256), because in that case the compiled code doesn't use the bit map.
2326        */
2327    
2328        memset(classbits, 0, 32 * sizeof(uschar));
2329    
2330  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2331      class_utf8 = FALSE;                       /* No chars >= 256 */      class_utf8 = FALSE;                       /* No chars >= 256 */
2332      class_utf8data = code + LINK_SIZE + 34;   /* For UTF-8 items */      class_utf8data = code + LINK_SIZE + 2;    /* For UTF-8 items */
2333  #endif  #endif
2334    
     /* Initialize the 32-char bit map to all zeros. We have to build the  
     map in a temporary bit of store, in case the class contains only 1  
     character (< 256), because in that case the compiled code doesn't use the  
     bit map. */  
   
     memset(classbits, 0, 32 * sizeof(uschar));  
   
2335      /* Process characters until ] is reached. By writing this as a "do" it      /* Process characters until ] is reached. By writing this as a "do" it
2336      means that an initial ] is taken as a data character. The first pass      means that an initial ] is taken as a data character. At the start of the
2337      through the regex checked the overall syntax, so we don't need to be very      loop, c contains the first byte of the character. */
     strict here. At the start of the loop, c contains the first byte of the  
     character. */  
2338    
2339      do      if (c != 0) do
2340        {        {
2341          const uschar *oldptr;
2342    
2343  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2344        if (utf8 && c > 127)        if (utf8 && c > 127)
2345          {                           /* Braces are required because the */          {                           /* Braces are required because the */
# Line 1814  for (;; ptr++) Line 2351  for (;; ptr++)
2351    
2352        if (inescq)        if (inescq)
2353          {          {
2354          if (c == '\\' && ptr[1] == 'E')          if (c == '\\' && ptr[1] == 'E')     /* If we are at \E */
2355            {            {
2356            inescq = FALSE;            inescq = FALSE;                   /* Reset literal state */
2357            ptr++;            ptr++;                            /* Skip the 'E' */
2358            continue;            continue;                         /* Carry on with next */
2359            }            }
2360          else goto LONE_SINGLE_CHARACTER;          goto CHECK_RANGE;                   /* Could be range if \E follows */
2361          }          }
2362    
2363        /* Handle POSIX class names. Perl allows a negation extension of the        /* Handle POSIX class names. Perl allows a negation extension of the
# Line 1911  for (;; ptr++) Line 2448  for (;; ptr++)
2448          }          }
2449    
2450        /* Backslash may introduce a single character, or it may introduce one        /* Backslash may introduce a single character, or it may introduce one
2451        of the specials, which just set a flag. Escaped items are checked for        of the specials, which just set a flag. The sequence \b is a special
2452        validity in the pre-compiling pass. The sequence \b is a special case.        case. Inside a class (and only there) it is treated as backspace.
2453        Inside a class (and only there) it is treated as backspace. Elsewhere        Elsewhere it marks a word boundary. Other escapes have preset maps ready
2454        it marks a word boundary. Other escapes have preset maps ready to        to or into the one we are building. We assume they have more than one
       or into the one we are building. We assume they have more than one  
2455        character in them, so set class_charcount bigger than one. */        character in them, so set class_charcount bigger than one. */
2456    
2457        if (c == '\\')        if (c == '\\')
2458          {          {
2459          c = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);          c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2460            if (*errorcodeptr != 0) goto FAILED;
2461    
2462          if (-c == ESC_b) c = '\b';       /* \b is backspace in a class */          if (-c == ESC_b) c = '\b';       /* \b is backspace in a class */
2463          else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */          else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */
2464            else if (-c == ESC_R) c = 'R';   /* \R is literal R in a class */
2465          else if (-c == ESC_Q)            /* Handle start of quoted string */          else if (-c == ESC_Q)            /* Handle start of quoted string */
2466            {            {
2467            if (ptr[1] == '\\' && ptr[2] == 'E')            if (ptr[1] == '\\' && ptr[2] == 'E')
# Line 1938  for (;; ptr++) Line 2476  for (;; ptr++)
2476            {            {
2477            register const uschar *cbits = cd->cbits;            register const uschar *cbits = cd->cbits;
2478            class_charcount += 2;     /* Greater than 1 is what matters */            class_charcount += 2;     /* Greater than 1 is what matters */
2479            switch (-c)  
2480              /* Save time by not doing this in the pre-compile phase. */
2481    
2482              if (lengthptr == NULL) switch (-c)
2483              {              {
2484              case ESC_d:              case ESC_d:
2485              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
# Line 1966  for (;; ptr++) Line 2507  for (;; ptr++)
2507              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
2508              continue;              continue;
2509    
2510                case ESC_E: /* Perl ignores an orphan \E */
2511                continue;
2512    
2513                default:    /* Not recognized; fall through */
2514                break;      /* Need "default" setting to stop compiler warning. */
2515                }
2516    
2517              /* In the pre-compile phase, just do the recognition. */
2518    
2519              else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2520                       c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2521    
2522              /* We need to deal with \P and \p in both phases. */
2523    
2524  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2525              case ESC_p:            if (-c == ESC_p || -c == ESC_P)
2526              case ESC_P:              {
2527                {              BOOL negated;
2528                BOOL negated;              int pdata;
2529                int pdata;              int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2530                int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);              if (ptype < 0) goto FAILED;
2531                if (ptype < 0) goto FAILED;              class_utf8 = TRUE;
2532                class_utf8 = TRUE;              *class_utf8data++ = ((-c == ESC_p) != negated)?
2533                *class_utf8data++ = ((-c == ESC_p) != negated)?                XCL_PROP : XCL_NOTPROP;
2534                  XCL_PROP : XCL_NOTPROP;              *class_utf8data++ = ptype;
2535                *class_utf8data++ = ptype;              *class_utf8data++ = pdata;
2536                *class_utf8data++ = pdata;              class_charcount -= 2;   /* Not a < 256 character */
               class_charcount -= 2;   /* Not a < 256 character */  
               }  
2537              continue;              continue;
2538                }
2539  #endif  #endif
2540              /* Unrecognized escapes are faulted if PCRE is running in its
2541              strict mode. By default, for compatibility with Perl, they are
2542              treated as literals. */
2543    
2544              /* Unrecognized escapes are faulted if PCRE is running in its            if ((options & PCRE_EXTRA) != 0)
2545              strict mode. By default, for compatibility with Perl, they are              {
2546              treated as literals. */              *errorcodeptr = ERR7;
2547                goto FAILED;
             default:  
             if ((options & PCRE_EXTRA) != 0)  
               {  
               *errorcodeptr = ERR7;  
               goto FAILED;  
               }  
             c = *ptr;              /* The final character */  
             class_charcount -= 2;  /* Undo the default count from above */  
2548              }              }
2549    
2550              class_charcount -= 2;  /* Undo the default count from above */
2551              c = *ptr;              /* Get the final character and fall through */
2552            }            }
2553    
2554          /* Fall through if we have a single character (c >= 0). This may be          /* Fall through if we have a single character (c >= 0). This may be
2555          > 256 in UTF-8 mode. */          greater than 256 in UTF-8 mode. */
2556    
2557          }   /* End of backslash handling */          }   /* End of backslash handling */
2558    
2559        /* A single character may be followed by '-' to form a range. However,        /* A single character may be followed by '-' to form a range. However,
2560        Perl does not permit ']' to be the end of the range. A '-' character        Perl does not permit ']' to be the end of the range. A '-' character
2561        here is treated as a literal. */        at the end is treated as a literal. Perl ignores orphaned \E sequences
2562          entirely. The code for handling \Q and \E is messy. */
2563    
2564        if (ptr[1] == '-' && ptr[2] != ']')        CHECK_RANGE:
2565          while (ptr[1] == '\\' && ptr[2] == 'E')
2566            {
2567            inescq = FALSE;
2568            ptr += 2;
2569            }
2570    
2571          oldptr = ptr;
2572    
2573          if (!inescq && ptr[1] == '-')
2574          {          {
2575          int d;          int d;
2576          ptr += 2;          ptr += 2;
2577            while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
2578    
2579            /* If we hit \Q (not followed by \E) at this point, go into escaped
2580            mode. */
2581    
2582            while (*ptr == '\\' && ptr[1] == 'Q')
2583              {
2584              ptr += 2;
2585              if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
2586              inescq = TRUE;
2587              break;
2588              }
2589    
2590            if (*ptr == 0 || (!inescq && *ptr == ']'))
2591              {
2592              ptr = oldptr;
2593              goto LONE_SINGLE_CHARACTER;
2594              }
2595    
2596  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2597          if (utf8)          if (utf8)
# Line 2026  for (;; ptr++) Line 2606  for (;; ptr++)
2606          not any of the other escapes. Perl 5.6 treats a hyphen as a literal          not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2607          in such circumstances. */          in such circumstances. */
2608    
2609          if (d == '\\')          if (!inescq && d == '\\')
2610            {            {
2611            const uschar *oldptr = ptr;            d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2612            d = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);            if (*errorcodeptr != 0) goto FAILED;
2613    
2614            /* \b is backspace; \X is literal X; any other special means the '-'            /* \b is backspace; \X is literal X; \R is literal R; any other
2615            was literal */            special means the '-' was literal */
2616    
2617            if (d < 0)            if (d < 0)
2618              {              {
2619              if (d == -ESC_b) d = '\b';              if (d == -ESC_b) d = '\b';
2620              else if (d == -ESC_X) d = 'X'; else              else if (d == -ESC_X) d = 'X';
2621                else if (d == -ESC_R) d = 'R'; else
2622                {                {
2623                ptr = oldptr - 2;                ptr = oldptr;
2624                goto LONE_SINGLE_CHARACTER;  /* A few lines below */                goto LONE_SINGLE_CHARACTER;  /* A few lines below */
2625                }                }
2626              }              }
2627            }            }
2628    
2629          /* The check that the two values are in the correct order happens in          /* Check that the two values are in the correct order. Optimize
2630          the pre-pass. Optimize one-character ranges */          one-character ranges */
2631    
2632            if (d < c)
2633              {
2634              *errorcodeptr = ERR8;
2635              goto FAILED;
2636              }
2637    
2638          if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */          if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */
2639    
# Line 2067  for (;; ptr++) Line 2654  for (;; ptr++)
2654  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2655            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
2656              {              {
2657              int occ, ocd;              unsigned int occ, ocd;
2658              int cc = c;              unsigned int cc = c;
2659              int origd = d;              unsigned int origd = d;
2660              while (get_othercase_range(&cc, origd, &occ, &ocd))              while (get_othercase_range(&cc, origd, &occ, &ocd))
2661                {                {
2662                if (occ >= c && ocd <= d) continue;  /* Skip embedded ranges */                if (occ >= c && ocd <= d) continue;  /* Skip embedded ranges */
# Line 2127  for (;; ptr++) Line 2714  for (;; ptr++)
2714          ranges that lie entirely within 0-127 when there is UCP support; else          ranges that lie entirely within 0-127 when there is UCP support; else
2715          for partial ranges without UCP support. */          for partial ranges without UCP support. */
2716    
2717          for (; c <= d; c++)          class_charcount += d - c + 1;
2718            class_lastchar = d;
2719    
2720            /* We can save a bit of time by skipping this in the pre-compile. */
2721    
2722            if (lengthptr == NULL) for (; c <= d; c++)
2723            {            {
2724            classbits[c/8] |= (1 << (c&7));            classbits[c/8] |= (1 << (c&7));
2725            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
# Line 2135  for (;; ptr++) Line 2727  for (;; ptr++)
2727              int uc = cd->fcc[c];           /* flip case */              int uc = cd->fcc[c];           /* flip case */
2728              classbits[uc/8] |= (1 << (uc&7));              classbits[uc/8] |= (1 << (uc&7));
2729              }              }
           class_charcount++;                /* in case a one-char range */  
           class_lastchar = c;  
2730            }            }
2731    
2732          continue;   /* Go get the next char in the class */          continue;   /* Go get the next char in the class */
# Line 2160  for (;; ptr++) Line 2750  for (;; ptr++)
2750  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2751          if ((options & PCRE_CASELESS) != 0)          if ((options & PCRE_CASELESS) != 0)
2752            {            {
2753            int othercase;            unsigned int othercase;
2754            if ((othercase = _pcre_ucp_othercase(c)) >= 0)            if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
2755              {              {
2756              *class_utf8data++ = XCL_SINGLE;              *class_utf8data++ = XCL_SINGLE;
2757              class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);              class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
# Line 2186  for (;; ptr++) Line 2776  for (;; ptr++)
2776          }          }
2777        }        }
2778    
2779      /* Loop until ']' reached; the check for end of string happens inside the      /* Loop until ']' reached. This "while" is the end of the "do" above. */
     loop. This "while" is the end of the "do" above. */  
2780    
2781      while ((c = *(++ptr)) != ']' || inescq);      while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
2782    
2783        if (c == 0)                          /* Missing terminating ']' */
2784          {
2785          *errorcodeptr = ERR6;
2786          goto FAILED;
2787          }
2788    
2789      /* If class_charcount is 1, we saw precisely one character whose value is      /* If class_charcount is 1, we saw precisely one character whose value is
2790      less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we      less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
# Line 2253  for (;; ptr++) Line 2848  for (;; ptr++)
2848    
2849      /* If there are characters with values > 255, we have to compile an      /* If there are characters with values > 255, we have to compile an
2850      extended class, with its own opcode. If there are no characters < 256,      extended class, with its own opcode. If there are no characters < 256,
2851      we can omit the bitmap. */      we can omit the bitmap in the actual compiled code. */
2852    
2853  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2854      if (class_utf8)      if (class_utf8)
# Line 2263  for (;; ptr++) Line 2858  for (;; ptr++)
2858        code += LINK_SIZE;        code += LINK_SIZE;
2859        *code = negate_class? XCL_NOT : 0;        *code = negate_class? XCL_NOT : 0;
2860    
2861        /* If the map is required, install it, and move on to the end of        /* If the map is required, move up the extra data to make room for it;
2862        the extra data */        otherwise just move the code pointer to the end of the extra data. */
2863    
2864        if (class_charcount > 0)        if (class_charcount > 0)
2865          {          {
2866          *code++ |= XCL_MAP;          *code++ |= XCL_MAP;
2867            memmove(code + 32, code, class_utf8data - code);
2868          memcpy(code, classbits, 32);          memcpy(code, classbits, 32);
2869          code = class_utf8data;          code = class_utf8data + 32;
         }  
   
       /* If the map is not required, slide down the extra data. */  
   
       else  
         {  
         int len = class_utf8data - (code + 33);  
         memmove(code + 1, code + 33, len);  
         code += len + 1;  
2870          }          }
2871          else code = class_utf8data;
2872    
2873        /* Now fill in the complete length of the item */        /* Now fill in the complete length of the item */
2874    
# Line 2297  for (;; ptr++) Line 2885  for (;; ptr++)
2885      if (negate_class)      if (negate_class)
2886        {        {
2887        *code++ = OP_NCLASS;        *code++ = OP_NCLASS;
2888        for (c = 0; c < 32; c++) code[c] = ~classbits[c];        if (lengthptr == NULL)    /* Save time in the pre-compile phase */
2889            for (c = 0; c < 32; c++) code[c] = ~classbits[c];
2890        }        }
2891      else      else
2892        {        {
# Line 2307  for (;; ptr++) Line 2896  for (;; ptr++)
2896      code += 32;      code += 32;
2897      break;      break;
2898    
2899    
2900        /* ===================================================================*/
2901      /* Various kinds of repeat; '{' is not necessarily a quantifier, but this      /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
2902      has been tested above. */      has been tested above. */
2903    
# Line 2374  for (;; ptr++) Line 2965  for (;; ptr++)
2965        }        }
2966      else repeat_type = greedy_default;      else repeat_type = greedy_default;
2967    
     /* If previous was a recursion, we need to wrap it inside brackets so that  
     it can be replicated if necessary. */  
   
     if (*previous == OP_RECURSE)  
       {  
       memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);  
       code += 1 + LINK_SIZE;  
       *previous = OP_BRA;  
       PUT(previous, 1, code - previous);  
       *code = OP_KET;  
       PUT(code, 1, code - previous);  
       code += 1 + LINK_SIZE;  
       }  
   
2968      /* If previous was a character match, abolish the item and generate a      /* If previous was a character match, abolish the item and generate a
2969      repeat item instead. If a char item has a minimum of more than one, ensure      repeat item instead. If a char item has a minimum of more than one, ensure
2970      that it is set in reqbyte - it might not be if a sequence such as x{3} is      that it is set in reqbyte - it might not be if a sequence such as x{3} is
# Line 2421  for (;; ptr++) Line 2998  for (;; ptr++)
2998          if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;          if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
2999          }          }
3000    
3001          /* If the repetition is unlimited, it pays to see if the next thing on
3002          the line is something that cannot possibly match this character. If so,
3003          automatically possessifying this item gains some performance in the case
3004          where the match fails. */
3005    
3006          if (!possessive_quantifier &&
3007              repeat_max < 0 &&
3008              check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3009                options, cd))
3010            {
3011            repeat_type = 0;    /* Force greedy */
3012            possessive_quantifier = TRUE;
3013            }
3014    
3015        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
3016        }        }
3017    
3018      /* If previous was a single negated character ([^a] or similar), we use      /* If previous was a single negated character ([^a] or similar), we use
3019      one of the special opcodes, replacing it. The code is shared with single-      one of the special opcodes, replacing it. The code is shared with single-
3020      character repeats by setting opt_type to add a suitable offset into      character repeats by setting opt_type to add a suitable offset into
3021      repeat_type. OP_NOT is currently used only for single-byte chars. */      repeat_type. We can also test for auto-possessification. OP_NOT is
3022        currently used only for single-byte chars. */
3023    
3024      else if (*previous == OP_NOT)      else if (*previous == OP_NOT)
3025        {        {
3026        op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */        op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */
3027        c = previous[1];        c = previous[1];
3028          if (!possessive_quantifier &&
3029              repeat_max < 0 &&
3030              check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3031            {
3032            repeat_type = 0;    /* Force greedy */
3033            possessive_quantifier = TRUE;
3034            }
3035        goto OUTPUT_SINGLE_REPEAT;        goto OUTPUT_SINGLE_REPEAT;
3036        }        }
3037    
# Line 2450  for (;; ptr++) Line 3049  for (;; ptr++)
3049        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
3050        c = *previous;        c = *previous;
3051    
3052          if (!possessive_quantifier &&
3053              repeat_max < 0 &&
3054              check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3055            {
3056            repeat_type = 0;    /* Force greedy */
3057            possessive_quantifier = TRUE;
3058            }
3059    
3060        OUTPUT_SINGLE_REPEAT:        OUTPUT_SINGLE_REPEAT:
3061        if (*previous == OP_PROP || *previous == OP_NOTPROP)        if (*previous == OP_PROP || *previous == OP_NOTPROP)
3062          {          {
# Line 2490  for (;; ptr++) Line 3097  for (;; ptr++)
3097          }          }
3098    
3099        /* A repeat minimum of 1 is optimized into some special cases. If the        /* A repeat minimum of 1 is optimized into some special cases. If the
3100        maximum is unlimited, we use OP_PLUS. Otherwise, the original item it        maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3101        left in place and, if the maximum is greater than 1, we use OP_UPTO with        left in place and, if the maximum is greater than 1, we use OP_UPTO with
3102        one less than the maximum. */        one less than the maximum. */
3103    
# Line 2543  for (;; ptr++) Line 3150  for (;; ptr++)
3150            }            }
3151    
3152          /* Else insert an UPTO if the max is greater than the min, again          /* Else insert an UPTO if the max is greater than the min, again
3153          preceded by the character, for the previously inserted code. */          preceded by the character, for the previously inserted code. If the
3154            UPTO is just for 1 instance, we can use QUERY instead. */
3155    
3156          else if (repeat_max != repeat_min)          else if (repeat_max != repeat_min)
3157            {            {
# Line 2562  for (;; ptr++) Line 3170  for (;; ptr++)
3170              *code++ = prop_value;              *code++ = prop_value;
3171              }              }
3172            repeat_max -= repeat_min;            repeat_max -= repeat_min;
3173            *code++ = OP_UPTO + repeat_type;  
3174            PUT2INC(code, 0, repeat_max);            if (repeat_max == 1)
3175                {
3176                *code++ = OP_QUERY + repeat_type;
3177                }
3178              else
3179                {
3180                *code++ = OP_UPTO + repeat_type;
3181                PUT2INC(code, 0, repeat_max);
3182                }
3183            }            }
3184          }          }
3185    
# Line 2630  for (;; ptr++) Line 3246  for (;; ptr++)
3246      /* If previous was a bracket group, we may have to replicate it in certain      /* If previous was a bracket group, we may have to replicate it in certain
3247      cases. */      cases. */
3248    
3249      else if (*previous >= OP_BRA || *previous == OP_ONCE ||      else if (*previous == OP_BRA  || *previous == OP_CBRA ||
3250               *previous == OP_COND)               *previous == OP_ONCE || *previous == OP_COND)
3251        {        {
3252        register int i;        register int i;
3253        int ketoffset = 0;        int ketoffset = 0;
3254        int len = code - previous;        int len = code - previous;
3255        uschar *bralink = NULL;        uschar *bralink = NULL;
3256    
3257          /* Repeating a DEFINE group is pointless */
3258    
3259          if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3260            {
3261            *errorcodeptr = ERR55;
3262            goto FAILED;
3263            }
3264    
3265          /* This is a paranoid check to stop integer overflow later on */
3266    
3267          if (len > MAX_DUPLENGTH)
3268            {
3269            *errorcodeptr = ERR50;
3270            goto FAILED;
3271            }
3272    
3273        /* If the maximum repeat count is unlimited, find the end of the bracket        /* If the maximum repeat count is unlimited, find the end of the bracket
3274        by scanning through from the start, and compute the offset back to it        by scanning through from the start, and compute the offset back to it
3275        from the current code pointer. There may be an OP_OPT setting following        from the current code pointer. There may be an OP_OPT setting following
# Line 2672  for (;; ptr++) Line 3304  for (;; ptr++)
3304          /* If the maximum is 1 or unlimited, we just have to stick in the          /* If the maximum is 1 or unlimited, we just have to stick in the
3305          BRAZERO and do no more at this point. However, we do need to adjust          BRAZERO and do no more at this point. However, we do need to adjust
3306          any OP_RECURSE calls inside the group that refer to the group itself or          any OP_RECURSE calls inside the group that refer to the group itself or
3307          any internal group, because the offset is from the start of the whole          any internal or forward referenced group, because the offset is from
3308          regex. Temporarily terminate the pattern while doing this. */          the start of the whole regex. Temporarily terminate the pattern while
3309            doing this. */
3310    
3311          if (repeat_max <= 1)          if (repeat_max <= 1)
3312            {            {
3313            *code = OP_END;            *code = OP_END;
3314            adjust_recurse(previous, 1, utf8, cd);            adjust_recurse(previous, 1, utf8, cd, save_hwm);
3315            memmove(previous+1, previous, len);            memmove(previous+1, previous, len);
3316            code++;            code++;
3317            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
# Line 2696  for (;; ptr++) Line 3329  for (;; ptr++)
3329            {            {
3330            int offset;            int offset;
3331            *code = OP_END;            *code = OP_END;
3332            adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd);            adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3333            memmove(previous + 2 + LINK_SIZE, previous, len);            memmove(previous + 2 + LINK_SIZE, previous, len);
3334            code += 2 + LINK_SIZE;            code += 2 + LINK_SIZE;
3335            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
# Line 2716  for (;; ptr++) Line 3349  for (;; ptr++)
3349        /* If the minimum is greater than zero, replicate the group as many        /* If the minimum is greater than zero, replicate the group as many
3350        times as necessary, and adjust the maximum to the number of subsequent        times as necessary, and adjust the maximum to the number of subsequent
3351        copies that we need. If we set a first char from the group, and didn't        copies that we need. If we set a first char from the group, and didn't
3352        set a required char, copy the latter from the former. */        set a required char, copy the latter from the former. If there are any
3353          forward reference subroutine calls in the group, there will be entries on
3354          the workspace list; replicate these with an appropriate increment. */
3355    
3356        else        else
3357          {          {
3358          if (repeat_min > 1)          if (repeat_min > 1)
3359            {            {
3360            if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;            /* In the pre-compile phase, we don't actually do the replication. We
3361            for (i = 1; i < repeat_min; i++)            just adjust the length as if we had. */
3362    
3363              if (lengthptr != NULL)
3364                *lengthptr += (repeat_min - 1)*length_prevgroup;
3365    
3366              /* This is compiling for real */
3367    
3368              else
3369              {              {
3370              memcpy(code, previous, len);              if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3371              code += len;              for (i = 1; i < repeat_min; i++)
3372                  {
3373                  uschar *hc;
3374                  uschar *this_hwm = cd->hwm;
3375                  memcpy(code, previous, len);
3376                  for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3377                    {
3378                    PUT(cd->hwm, 0, GET(hc, 0) + len);
3379                    cd->hwm += LINK_SIZE;
3380                    }
3381                  save_hwm = this_hwm;
3382                  code += len;
3383                  }
3384              }              }
3385            }            }
3386    
3387          if (repeat_max > 0) repeat_max -= repeat_min;          if (repeat_max > 0) repeat_max -= repeat_min;
3388          }          }
3389    
# Line 2736  for (;; ptr++) Line 3391  for (;; ptr++)
3391        the maximum is limited, it replicates the group in a nested fashion,        the maximum is limited, it replicates the group in a nested fashion,
3392        remembering the bracket starts on a stack. In the case of a zero minimum,        remembering the bracket starts on a stack. In the case of a zero minimum,
3393        the first one was set up above. In all cases the repeat_max now specifies        the first one was set up above. In all cases the repeat_max now specifies
3394        the number of additional copies needed. */        the number of additional copies needed. Again, we must remember to
3395          replicate entries on the forward reference list. */
3396    
3397        if (repeat_max >= 0)        if (repeat_max >= 0)
3398          {          {
3399          for (i = repeat_max - 1; i >= 0; i--)          /* In the pre-compile phase, we don't actually do the replication. We
3400            just adjust the length as if we had. For each repetition we must add 1
3401            to the length for BRAZERO and for all but the last repetition we must
3402            add 2 + 2*LINK_SIZE to allow for the nesting that occurs. */
3403    
3404            if (lengthptr != NULL && repeat_max > 0)
3405              *lengthptr += repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3406                2 - 2*LINK_SIZE;  /* Last one doesn't nest */
3407    
3408            /* This is compiling for real */
3409    
3410            else for (i = repeat_max - 1; i >= 0; i--)
3411            {            {
3412              uschar *hc;
3413              uschar *this_hwm = cd->hwm;
3414    
3415            *code++ = OP_BRAZERO + repeat_type;            *code++ = OP_BRAZERO + repeat_type;
3416    
3417            /* All but the final copy start a new nesting, maintaining the            /* All but the final copy start a new nesting, maintaining the
# Line 2757  for (;; ptr++) Line 3427  for (;; ptr++)
3427              }              }
3428    
3429            memcpy(code, previous, len);            memcpy(code, previous, len);
3430              for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3431                {
3432                PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3433                cd->hwm += LINK_SIZE;
3434                }
3435              save_hwm = this_hwm;
3436            code += len;            code += len;
3437            }            }
3438    
# Line 2779  for (;; ptr++) Line 3455  for (;; ptr++)
3455        /* If the maximum is unlimited, set a repeater in the final copy. We        /* If the maximum is unlimited, set a repeater in the final copy. We
3456        can't just offset backwards from the current code point, because we        can't just offset backwards from the current code point, because we
3457        don't know if there's been an options resetting after the ket. The        don't know if there's been an options resetting after the ket. The
3458        correct offset was computed above. */        correct offset was computed above.
3459    
3460        else code[-ketoffset] = OP_KETRMAX + repeat_type;        Then, when we are doing the actual compile phase, check to see whether
3461          this group is a non-atomic one that could match an empty string. If so,
3462          convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
3463          that runtime checking can be done. [This check is also applied to
3464          atomic groups at runtime, but in a different way.] */
3465    
3466          else
3467            {
3468            uschar *ketcode = code - ketoffset;
3469            uschar *bracode = ketcode - GET(ketcode, 1);
3470            *ketcode = OP_KETRMAX + repeat_type;
3471            if (lengthptr == NULL && *bracode != OP_ONCE)
3472              {
3473              uschar *scode = bracode;
3474              do
3475                {
3476                if (could_be_empty_branch(scode, ketcode, utf8))
3477                  {
3478                  *bracode += OP_SBRA - OP_BRA;
3479                  break;
3480                  }
3481                scode += GET(scode, 1);
3482                }
3483              while (*scode == OP_ALT);
3484              }
3485            }
3486        }        }
3487    
3488      /* Else there's some kind of shambles */      /* Else there's some kind of shambles */
# Line 2792  for (;; ptr++) Line 3493  for (;; ptr++)
3493        goto FAILED;        goto FAILED;
3494        }        }
3495    
3496      /* If the character following a repeat is '+', we wrap the entire repeated      /* If the character following a repeat is '+', or if certain optimization
3497      item inside OP_ONCE brackets. This is just syntactic sugar, taken from      tests above succeeded, possessive_quantifier is TRUE. For some of the
3498      Sun's Java package. The repeated item starts at tempcode, not at previous,      simpler opcodes, there is a special alternative opcode for this. For
3499      which might be the first part of a string whose (former) last char we      anything else, we wrap the entire repeated item inside OP_ONCE brackets.
3500      repeated. However, we don't support '+' after a greediness '?'. */      The '+' notation is just syntactic sugar, taken from Sun's Java package,
3501        but the special opcodes can optimize it a bit. The repeated item starts at
3502        tempcode, not at previous, which might be the first part of a string whose
3503        (former) last char we repeated.
3504    
3505        Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
3506        an 'upto' may follow. We skip over an 'exact' item, and then test the
3507        length of what remains before proceeding. */
3508    
3509      if (possessive_quantifier)      if (possessive_quantifier)
3510        {        {
3511        int len = code - tempcode;        int len;
3512        memmove(tempcode + 1+LINK_SIZE, tempcode, len);        if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
3513        code += 1 + LINK_SIZE;            *tempcode == OP_NOTEXACT)
3514        len += 1 + LINK_SIZE;          tempcode += _pcre_OP_lengths[*tempcode];
3515        tempcode[0] = OP_ONCE;        len = code - tempcode;
3516        *code++ = OP_KET;        if (len > 0) switch (*tempcode)
3517        PUTINC(code, 0, len);          {
3518        PUT(tempcode, 1, len);          case OP_STAR:  *tempcode = OP_POSSTAR; break;
3519            case OP_PLUS:  *tempcode = OP_POSPLUS; break;
3520            case OP_QUERY: *tempcode = OP_POSQUERY; break;
3521            case OP_UPTO:  *tempcode = OP_POSUPTO; break;
3522    
3523            case OP_TYPESTAR:  *tempcode = OP_TYPEPOSSTAR; break;
3524            case OP_TYPEPLUS:  *tempcode = OP_TYPEPOSPLUS; break;
3525            case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
3526            case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;
3527    
3528            case OP_NOTSTAR:  *tempcode = OP_NOTPOSSTAR; break;
3529            case OP_NOTPLUS:  *tempcode = OP_NOTPOSPLUS; break;
3530            case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
3531            case OP_NOTUPTO:  *tempcode = OP_NOTPOSUPTO; break;
3532    
3533            default:
3534            memmove(tempcode + 1+LINK_SIZE, tempcode, len);
3535            code += 1 + LINK_SIZE;
3536            len += 1 + LINK_SIZE;
3537            tempcode[0] = OP_ONCE;
3538            *code++ = OP_KET;
3539            PUTINC(code, 0, len);
3540            PUT(tempcode, 1, len);
3541            break;
3542            }
3543        }        }
3544    
3545      /* In all cases we no longer have a previous item. We also set the      /* In all cases we no longer have a previous item. We also set the
# Line 2820  for (;; ptr++) Line 3552  for (;; ptr++)
3552      break;      break;
3553    
3554    
3555      /* Start of nested bracket sub-expression, or comment or lookahead or      /* ===================================================================*/
3556      lookbehind or option setting or condition. First deal with special things      /* Start of nested parenthesized sub-expression, or comment or lookahead or
3557      that can come after a bracket; all are introduced by ?, and the appearance      lookbehind or option setting or condition or all the other extended
3558      of any of them means that this is not a referencing group. They were      parenthesis forms. First deal with the specials; all are introduced by ?,
3559      checked for validity in the first pass over the string, so we don't have to      and the appearance of any of them means that this is not a capturing
3560      check for syntax errors here.  */      group. */
3561    
3562      case '(':      case '(':
3563      newoptions = options;      newoptions = options;
3564      skipbytes = 0;      skipbytes = 0;
3565        bravalue = OP_CBRA;
3566        save_hwm = cd->hwm;
3567    
3568      if (*(++ptr) == '?')      if (*(++ptr) == '?')
3569        {        {
3570        int set, unset;        int i, set, unset, namelen;
3571        int *optset;        int *optset;
3572          const uschar *name;
3573          uschar *slot;
3574    
3575        switch (*(++ptr))        switch (*(++ptr))
3576          {          {
3577          case '#':                 /* Comment; skip to ket */          case '#':                 /* Comment; skip to ket */
3578          ptr++;          ptr++;
3579          while (*ptr != ')') ptr++;          while (*ptr != 0 && *ptr != ')') ptr++;
3580            if (*ptr == 0)
3581              {
3582              *errorcodeptr = ERR18;
3583              goto FAILED;
3584              }
3585          continue;          continue;
3586    
3587          case ':':                 /* Non-extracting bracket */  
3588            /* ------------------------------------------------------------ */
3589            case ':':                 /* Non-capturing bracket */
3590          bravalue = OP_BRA;          bravalue = OP_BRA;
3591          ptr++;          ptr++;
3592          break;          break;
3593    
3594    
3595            /* ------------------------------------------------------------ */
3596          case '(':          case '(':
3597          bravalue = OP_COND;       /* Conditional group */          bravalue = OP_COND;       /* Conditional group */
3598    
3599          /* Condition to test for recursion */          /* A condition can be an assertion, a number (referring to a numbered
3600            group), a name (referring to a named group), or 'R', referring to
3601            recursion. R<digits> and R&name are also permitted for recursion tests.
3602    
3603            There are several syntaxes for testing a named group: (?(name)) is used
3604            by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
3605    
3606            There are two unfortunate ambiguities, caused by history. (a) 'R' can
3607            be the recursive thing or the name 'R' (and similarly for 'R' followed
3608            by digits), and (b) a number could be a name that consists of digits.
3609            In both cases, we look for a name first; if not found, we try the other
3610            cases. */
3611    
3612            /* For conditions that are assertions, check the syntax, and then exit
3613            the switch. This will take control down to where bracketed groups,
3614            including assertions, are processed. */
3615    
3616            if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
3617              break;
3618    
3619            /* Most other conditions use OP_CREF (a couple change to OP_RREF
3620            below), and all need to skip 3 bytes at the start of the group. */
3621    
3622            code[1+LINK_SIZE] = OP_CREF;
3623            skipbytes = 3;
3624    
3625            /* Check for a test for recursion in a named group. */
3626    
3627            if (ptr[1] == 'R' && ptr[2] == '&')
3628              {
3629              terminator = -1;
3630              ptr += 2;
3631              code[1+LINK_SIZE] = OP_RREF;    /* Change the type of test */
3632              }
3633    
3634            /* Check for a test for a named group's having been set, using the Perl
3635            syntax (?(<name>) or (?('name') */
3636    
3637            else if (ptr[1] == '<')
3638              {
3639              terminator = '>';
3640              ptr++;
3641              }
3642            else if (ptr[1] == '\'')
3643              {
3644              terminator = '\'';
3645              ptr++;
3646              }
3647            else terminator = 0;
3648    
3649            /* We now expect to read a name; anything else is an error */
3650    
3651            if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
3652              {
3653              ptr += 1;  /* To get the right offset */
3654              *errorcodeptr = ERR28;
3655              goto FAILED;
3656              }
3657    
3658            /* Read the name, but also get it as a number if it's all digits */
3659    
3660            recno = 0;
3661            name = ++ptr;
3662            while ((cd->ctypes[*ptr] & ctype_word) != 0)
3663              {
3664              if (recno >= 0)
3665                recno = ((digitab[*ptr] & ctype_digit) != 0)?
3666                  recno * 10 + *ptr - '0' : -1;
3667              ptr++;
3668              }
3669            namelen = ptr - name;
3670    
3671            if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
3672              {
3673              ptr--;      /* Error offset */
3674              *errorcodeptr = ERR26;
3675              goto FAILED;
3676              }
3677    
3678            /* Do no further checking in the pre-compile phase. */
3679    
3680            if (lengthptr != NULL) break;
3681    
3682            /* In the real compile we do the work of looking for the actual
3683            reference. */
3684    
3685            slot = cd->name_table;
3686            for (i = 0; i < cd->names_found; i++)
3687              {
3688              if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
3689              slot += cd->name_entry_size;
3690              }
3691    
3692            /* Found a previous named subpattern */
3693    
3694            if (i < cd->names_found)
3695              {
3696              recno = GET2(slot, 0);
3697              PUT2(code, 2+LINK_SIZE, recno);
3698              }
3699    
3700            /* Search the pattern for a forward reference */
3701    
3702            else if ((i = find_parens(ptr, cd->bracount, name, namelen,
3703                            (options & PCRE_EXTENDED) != 0)) > 0)
3704              {
3705              PUT2(code, 2+LINK_SIZE, i);
3706              }
3707    
3708          if (ptr[1] == 'R')          /* If terminator == 0 it means that the name followed directly after
3709            the opening parenthesis [e.g. (?(abc)...] and in this case there are
3710            some further alternatives to try. For the cases where terminator != 0
3711            [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
3712            now checked all the possibilities, so give an error. */
3713    
3714            else if (terminator != 0)
3715            {            {
3716            code[1+LINK_SIZE] = OP_CREF;            *errorcodeptr = ERR15;
3717            PUT2(code, 2+LINK_SIZE, CREF_RECURSE);            goto FAILED;
           skipbytes = 3;  
           ptr += 3;  
3718            }            }
3719    
3720          /* Condition to test for a numbered subpattern match. We know that          /* Check for (?(R) for recursion. Allow digits after R to specify a
3721          if a digit follows ( then there will just be digits until ) because          specific group number. */
         the syntax was checked in the first pass. */  
3722    
3723          else if ((digitab[ptr[1]] && ctype_digit) != 0)          else if (*name == 'R')
3724            {            {
3725            int condref;                 /* Don't amalgamate; some compilers */            recno = 0;
3726            condref = *(++ptr) - '0';    /* grumble at autoincrement in declaration */            for (i = 1; i < namelen; i++)
           while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';  
           if (condref == 0)  
3727              {              {
3728              *errorcodeptr = ERR35;              if ((digitab[name[i]] & ctype_digit) == 0)
3729              goto FAILED;                {
3730                  *errorcodeptr = ERR15;
3731                  goto FAILED;
3732                  }
3733                recno = recno * 10 + name[i] - '0';
3734              }              }
3735            ptr++;            if (recno == 0) recno = RREF_ANY;
3736            code[1+LINK_SIZE] = OP_CREF;            code[1+LINK_SIZE] = OP_RREF;      /* Change test type */
3737            PUT2(code, 2+LINK_SIZE, condref);            PUT2(code, 2+LINK_SIZE, recno);
3738            skipbytes = 3;            }
3739    
3740            /* Similarly, check for the (?(DEFINE) "condition", which is always
3741            false. */
3742    
3743            else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
3744              {
3745              code[1+LINK_SIZE] = OP_DEF;
3746              skipbytes = 1;
3747              }
3748    
3749            /* Check for the "name" actually being a subpattern number. */
3750    
3751            else if (recno > 0)
3752              {
3753              PUT2(code, 2+LINK_SIZE, recno);
3754              }
3755    
3756            /* Either an unidentified subpattern, or a reference to (?(0) */
3757    
3758            else
3759              {
3760              *errorcodeptr = (recno == 0)? ERR35: ERR15;
3761              goto FAILED;
3762            }            }
         /* For conditions that are assertions, we just fall through, having  
         set bravalue above. */  
3763          break;          break;
3764    
3765    
3766            /* ------------------------------------------------------------ */
3767          case '=':                 /* Positive lookahead */          case '=':                 /* Positive lookahead */
3768          bravalue = OP_ASSERT;          bravalue = OP_ASSERT;
3769          ptr++;          ptr++;
3770          break;          break;
3771    
3772    
3773            /* ------------------------------------------------------------ */
3774          case '!':                 /* Negative lookahead */          case '!':                 /* Negative lookahead */
3775          bravalue = OP_ASSERT_NOT;          bravalue = OP_ASSERT_NOT;
3776          ptr++;          ptr++;
3777          break;          break;
3778    
3779          case '<':                 /* Lookbehinds */  
3780          switch (*(++ptr))          /* ------------------------------------------------------------ */
3781            case '<':                 /* Lookbehind or named define */
3782            switch (ptr[1])
3783            {            {
3784            case '=':               /* Positive lookbehind */            case '=':               /* Positive lookbehind */
3785            bravalue = OP_ASSERTBACK;            bravalue = OP_ASSERTBACK;
3786            ptr++;            ptr += 2;
3787            break;            break;
3788    
3789            case '!':               /* Negative lookbehind */            case '!':               /* Negative lookbehind */
3790            bravalue = OP_ASSERTBACK_NOT;            bravalue = OP_ASSERTBACK_NOT;
3791            ptr++;            ptr += 2;
3792            break;            break;
3793    
3794              default:                /* Could be name define, else bad */
3795              if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
3796              ptr++;                  /* Correct offset for error */
3797              *errorcodeptr = ERR24;
3798              goto FAILED;
3799            }            }
3800          break;          break;
3801    
3802    
3803            /* ------------------------------------------------------------ */
3804          case '>':                 /* One-time brackets */          case '>':                 /* One-time brackets */
3805          bravalue = OP_ONCE;          bravalue = OP_ONCE;
3806          ptr++;          ptr++;
3807          break;          break;
3808    
3809    
3810            /* ------------------------------------------------------------ */
3811          case 'C':                 /* Callout - may be followed by digits; */          case 'C':                 /* Callout - may be followed by digits; */
3812          previous_callout = code;  /* Save for later completion */          previous_callout = code;  /* Save for later completion */
3813          after_manual_callout = 1; /* Skip one item before completing */          after_manual_callout = 1; /* Skip one item before completing */
3814          *code++ = OP_CALLOUT;     /* Already checked that the terminating */          *code++ = OP_CALLOUT;
3815            {                       /* closing parenthesis is present. */            {
3816            int n = 0;            int n = 0;
3817            while ((digitab[*(++ptr)] & ctype_digit) != 0)            while ((digitab[*(++ptr)] & ctype_digit) != 0)
3818              n = n * 10 + *ptr - '0';              n = n * 10 + *ptr - '0';
3819              if (*ptr != ')')
3820                {
3821                *errorcodeptr = ERR39;
3822                goto FAILED;
3823                }
3824            if (n > 255)            if (n > 255)
3825              {              {
3826              *errorcodeptr = ERR38;              *errorcodeptr = ERR38;
# Line 2935  for (;; ptr++) Line 3834  for (;; ptr++)
3834          previous = NULL;          previous = NULL;
3835          continue;          continue;
3836    
3837          case 'P':                 /* Named subpattern handling */  
3838          if (*(++ptr) == '<')      /* Definition */          /* ------------------------------------------------------------ */
3839            case 'P':                 /* Python-style named subpattern handling */
3840            if (*(++ptr) == '=' || *ptr == '>')  /* Reference or recursion */
3841              {
3842              is_recurse = *ptr == '>';
3843              terminator = ')';
3844              goto NAMED_REF_OR_RECURSE;
3845              }
3846            else if (*ptr != '<')    /* Test for Python-style definition */
3847            {            {
3848            int i, namelen;            *errorcodeptr = ERR41;
3849            uschar *slot = cd->name_table;            goto FAILED;
3850            const uschar *name;     /* Don't amalgamate; some compilers */            }
3851            name = ++ptr;           /* grumble at autoincrement in declaration */          /* Fall through to handle (?P< as (?< is handled */
3852    
           while (*ptr++ != '>');  
           namelen = ptr - name - 1;  
3853    
3854            for (i = 0; i < cd->names_found; i++)          /* ------------------------------------------------------------ */
3855              {          DEFINE_NAME:    /* Come here from (?< handling */
3856              int crc = memcmp(name, slot+2, namelen);          case '\'':
3857              if (crc == 0)            {
3858                {            terminator = (*ptr == '<')? '>' : '\'';
3859                if (slot[2+namelen] == 0)            name = ++ptr;
3860                  {  
3861                  *errorcodeptr = ERR43;            while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
3862                  goto FAILED;            namelen = ptr - name;
3863                  }  
3864                crc = -1;             /* Current name is substring */            /* In the pre-compile phase, just do a syntax check. */
3865    
3866              if (lengthptr != NULL)
3867                {
3868                if (*ptr != terminator)
3869                  {
3870                  *errorcodeptr = ERR42;
3871                  goto FAILED;
3872                }                }
3873              if (crc < 0)              if (cd->names_found >= MAX_NAME_COUNT)
3874                {                {
3875                memmove(slot + cd->name_entry_size, slot,                *errorcodeptr = ERR49;
3876                  (cd->names_found - i) * cd->name_entry_size);                goto FAILED;
3877                break;                }
3878                if (namelen + 3 > cd->name_entry_size)
3879                  {
3880                  cd->name_entry_size = namelen + 3;
3881                  if (namelen > MAX_NAME_SIZE)
3882                    {
3883                    *errorcodeptr = ERR48;
3884                    goto FAILED;
3885                    }
3886                }                }
             slot += cd->name_entry_size;  
3887              }              }
3888    
3889            PUT2(slot, 0, *brackets + 1);            /* In the real compile, create the entry in the table */
3890            memcpy(slot + 2, name, namelen);  
3891            slot[2+namelen] = 0;            else
3892            cd->names_found++;              {
3893            goto NUMBERED_GROUP;              slot = cd->name_table;
3894                for (i = 0; i < cd->names_found; i++)
3895                  {
3896                  int crc = memcmp(name, slot+2, namelen);
3897                  if (crc == 0)
3898                    {
3899                    if (slot[2+namelen] == 0)
3900                      {
3901                      if ((options & PCRE_DUPNAMES) == 0)
3902                        {
3903                        *errorcodeptr = ERR43;
3904                        goto FAILED;
3905                        }
3906                      }
3907                    else crc = -1;      /* Current name is substring */
3908                    }
3909                  if (crc < 0)
3910                    {
3911                    memmove(slot + cd->name_entry_size, slot,
3912                      (cd->names_found - i) * cd->name_entry_size);
3913                    break;
3914                    }
3915                  slot += cd->name_entry_size;
3916                  }
3917    
3918                PUT2(slot, 0, cd->bracount + 1);
3919                memcpy(slot + 2, name, namelen);
3920                slot[2+namelen] = 0;
3921                }
3922            }            }
3923    
3924          if (*ptr == '=' || *ptr == '>')  /* Reference or recursion */          /* In both cases, count the number of names we've encountered. */
3925    
3926            ptr++;                    /* Move past > or ' */
3927            cd->names_found++;
3928            goto NUMBERED_GROUP;
3929    
3930    
3931            /* ------------------------------------------------------------ */
3932            case '&':                 /* Perl recursion/subroutine syntax */
3933            terminator = ')';
3934            is_recurse = TRUE;
3935            /* Fall through */
3936    
3937            /* We come here from the Python syntax above that handles both
3938            references (?P=name) and recursion (?P>name), as well as falling
3939            through from the Perl recursion syntax (?&name). */
3940    
3941            NAMED_REF_OR_RECURSE:
3942            name = ++ptr;
3943            while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
3944            namelen = ptr - name;
3945    
3946            /* In the pre-compile phase, do a syntax check and set a dummy
3947            reference number. */
3948    
3949            if (lengthptr != NULL)
3950            {            {
3951            int i, namelen;            if (*ptr != terminator)
3952            int type = *ptr++;              {
3953            const uschar *name = ptr;              *errorcodeptr = ERR42;
3954            uschar *slot = cd->name_table;              goto FAILED;
3955                }
3956              if (namelen > MAX_NAME_SIZE)
3957                {
3958                *errorcodeptr = ERR48;
3959                goto FAILED;
3960                }
3961              recno = 0;
3962              }
3963    
3964            while (*ptr != ')') ptr++;          /* In the real compile, seek the name in the table */
           namelen = ptr - name;  
3965    
3966            else
3967              {
3968              slot = cd->name_table;
3969            for (i = 0; i < cd->names_found; i++)            for (i = 0; i < cd->names_found; i++)
3970              {              {
3971              if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;              if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
3972              slot += cd->name_entry_size;              slot += cd->name_entry_size;
3973              }              }
3974            if (i >= cd->names_found)  
3975              if (i < cd->names_found)         /* Back reference */
3976                {
3977                recno = GET2(slot, 0);
3978                }
3979              else if ((recno =                /* Forward back reference */
3980                        find_parens(ptr, cd->bracount, name, namelen,
3981                          (options & PCRE_EXTENDED) != 0)) <= 0)
3982              {              {
3983              *errorcodeptr = ERR15;              *errorcodeptr = ERR15;
3984              goto FAILED;              goto FAILED;
3985              }              }
3986              }
3987    
3988            recno = GET2(slot, 0);          /* In both phases, we can now go to the code that handles numerical
3989            recursion or backreferences. */
           if (type == '>') goto HANDLE_RECURSION;  /* A few lines below */  
   
           /* Back reference */  
3990    
3991            previous = code;          if (is_recurse) goto HANDLE_RECURSION;
3992            *code++ = OP_REF;            else goto HANDLE_REFERENCE;
           PUT2INC(code, 0, recno);  
           cd->backref_map |= (recno < 32)? (1 << recno) : 1;  
           if (recno > cd->top_backref) cd->top_backref = recno;  
           continue;  
           }  
3993    
         /* Should never happen */  
         break;  
3994    
3995          case 'R':                 /* Pattern recursion */          /* ------------------------------------------------------------ */
3996            case 'R':                 /* Recursion */
3997          ptr++;                    /* Same as (?0)      */          ptr++;                    /* Same as (?0)      */
3998          /* Fall through */          /* Fall through */
3999    
         /* Recursion or "subroutine" call */  
4000    
4001          case '0': case '1': case '2': case '3': case '4':          /* ------------------------------------------------------------ */
4002          case '5': case '6': case '7': case '8': case '9':          case '0': case '1': case '2': case '3': case '4':   /* Recursion or */
4003            case '5': case '6': case '7': case '8': case '9':   /* subroutine */
4004            {            {
4005            const uschar *called;            const uschar *called;
4006            recno = 0;            recno = 0;
4007            while((digitab[*ptr] & ctype_digit) != 0)            while((digitab[*ptr] & ctype_digit) != 0)
4008              recno = recno * 10 + *ptr++ - '0';              recno = recno * 10 + *ptr++ - '0';
4009              if (*ptr != ')')
4010                {
4011                *errorcodeptr = ERR29;
4012                goto FAILED;
4013                }
4014    
4015            /* Come here from code above that handles a named recursion */            /* Come here from code above that handles a named recursion */
4016    
4017            HANDLE_RECURSION:            HANDLE_RECURSION:
4018    
4019            previous = code;            previous = code;
4020              called = cd->start_code;
4021    
4022            /* Find the bracket that is being referenced. Temporarily end the            /* When we are actually compiling, find the bracket that is being
4023            regex in case it doesn't exist. */            referenced. Temporarily end the regex in case it doesn't exist before
4024              this point. If we end up with a forward reference, first check that
4025            *code = OP_END;            the bracket does occur later so we can give the error (and position)
4026            called = (recno == 0)?            now. Then remember this forward reference in the workspace so it can
4027              cd->start_code : find_bracket(cd->start_code, utf8, recno);            be filled in at the end. */
4028    
4029            if (called == NULL)            if (lengthptr == NULL)
4030              {              {
4031              *errorcodeptr = ERR15;              *code = OP_END;
4032              goto FAILED;              if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
             }  
4033    
4034            /* If the subpattern is still open, this is a recursive call. We              /* Forward reference */
           check to see if this is a left recursion that could loop for ever,  
           and diagnose that case. */  
4035    
4036            if (GET(called, 1) == 0 && could_be_empty(called, code, bcptr, utf8))              if (called == NULL)
4037              {                {
4038              *errorcodeptr = ERR40;                if (find_parens(ptr, cd->bracount, NULL, recno,
4039              goto FAILED;                     (options & PCRE_EXTENDED) != 0) < 0)
4040                    {
4041                    *errorcodeptr = ERR15;
4042                    goto FAILED;
4043                    }
4044                  called = cd->start_code + recno;
4045                  PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
4046                  }
4047    
4048                /* If not a forward reference, and the subpattern is still open,
4049                this is a recursive call. We check to see if this is a left
4050                recursion that could loop for ever, and diagnose that case. */
4051    
4052                else if (GET(called, 1) == 0 &&
4053                         could_be_empty(called, code, bcptr, utf8))
4054                  {
4055                  *errorcodeptr = ERR40;
4056                  goto FAILED;
4057                  }
4058              }              }
4059    
4060            /* Insert the recursion/subroutine item, automatically wrapped inside            /* Insert the recursion/subroutine item, automatically wrapped inside
4061            "once" brackets. */            "once" brackets. Set up a "previous group" length so that a
4062              subsequent quantifier will work. */
4063    
4064            *code = OP_ONCE;            *code = OP_ONCE;
4065            PUT(code, 1, 2 + 2*LINK_SIZE);            PUT(code, 1, 2 + 2*LINK_SIZE);
# Line 3069  for (;; ptr++) Line 4072  for (;; ptr++)
4072            *code = OP_KET;            *code = OP_KET;
4073            PUT(code, 1, 2 + 2*LINK_SIZE);            PUT(code, 1, 2 + 2*LINK_SIZE);
4074            code += 1 + LINK_SIZE;            code += 1 + LINK_SIZE;
4075    
4076              length_prevgroup = 3 + 3*LINK_SIZE;
4077            }            }
4078    
4079            /* Can't determine a first byte now */
4080    
4081            if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4082          continue;          continue;
4083    
         /* Character after (? not specially recognized */  
4084    
4085          default:                  /* Option setting */          /* ------------------------------------------------------------ */
4086            default:              /* Other characters: check option setting */
4087          set = unset = 0;          set = unset = 0;
4088          optset = &set;          optset = &set;
4089    
# Line 3084  for (;; ptr++) Line 4093  for (;; ptr++)
4093              {              {
4094              case '-': optset = &unset; break;              case '-': optset = &unset; break;
4095    
4096                case 'J':    /* Record that it changed in the external options */
4097                *optset |= PCRE_DUPNAMES;
4098                cd->external_options |= PCRE_JCHANGED;
4099                break;
4100    
4101              case 'i': *optset |= PCRE_CASELESS; break;              case 'i': *optset |= PCRE_CASELESS; break;
4102              case 'm': *optset |= PCRE_MULTILINE; break;              case 'm': *optset |= PCRE_MULTILINE; break;
4103              case 's': *optset |= PCRE_DOTALL; break;              case 's': *optset |= PCRE_DOTALL; break;
4104              case 'x': *optset |= PCRE_EXTENDED; break;              case 'x': *optset |= PCRE_EXTENDED; break;
4105              case 'U': *optset |= PCRE_UNGREEDY; break;              case 'U': *optset |= PCRE_UNGREEDY; break;
4106              case 'X': *optset |= PCRE_EXTRA; break;              case 'X': *optset |= PCRE_EXTRA; break;
4107    
4108                default:  *errorcodeptr = ERR12;
4109                          ptr--;    /* Correct the offset */
4110                          goto FAILED;
4111              }              }
4112            }            }
4113    
# Line 3098  for (;; ptr++) Line 4116  for (;; ptr++)
4116          newoptions = (options | set) & (~unset);          newoptions = (options | set) & (~unset);
4117    
4118          /* If the options ended with ')' this is not the start of a nested          /* If the options ended with ')' this is not the start of a nested
4119          group with option changes, so the options change at this level. Compile          group with option changes, so the options change at this level. If this
4120          code to change the ims options if this setting actually changes any of          item is right at the start of the pattern, the options can be
4121          them. We also pass the new setting back so that it can be put at the          abstracted and made external in the pre-compile phase, and ignored in
4122          start of any following branches, and when this group ends (if we are in          the compile phase. This can be helpful when matching -- for instance in
4123          a group), a resetting item can be compiled.          caseless checking of required bytes.
4124    
4125          Note that if this item is right at the start of the pattern, the          If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
4126          options will have been abstracted and made global, so there will be no          definitely *not* at the start of the pattern because something has been
4127          change to compile. */          compiled. In the pre-compile phase, however, the code pointer can have
4128            that value after the start, because it gets reset as code is discarded
4129            during the pre-compile. However, this can happen only at top level - if
4130            we are within parentheses, the starting BRA will still be present. At
4131            any parenthesis level, the length value can be used to test if anything
4132            has been compiled at that level. Thus, a test for both these conditions
4133            is necessary to ensure we correctly detect the start of the pattern in
4134            both phases.
4135    
4136            If we are not at the pattern start, compile code to change the ims
4137            options if this setting actually changes any of them. We also pass the
4138            new setting back so that it can be put at the start of any following
4139            branches, and when this group ends (if we are in a group), a resetting
4140            item can be compiled. */
4141    
4142          if (*ptr == ')')          if (*ptr == ')')
4143            {            {
4144            if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))            if (code == cd->start_code + 1 + LINK_SIZE &&
4145                   (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
4146              {              {
4147              *code++ = OP_OPT;              cd->external_options = newoptions;
4148              *code++ = newoptions & PCRE_IMS;              options = newoptions;
4149              }              }
4150             else
4151                {
4152                if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
4153                  {
4154                  *code++ = OP_OPT;
4155                  *code++ = newoptions & PCRE_IMS;
4156                  }
4157    
4158            /* Change options at this level, and pass them back for use              /* Change options at this level, and pass them back for use
4159            in subsequent branches. Reset the greedy defaults and the case              in subsequent branches. Reset the greedy defaults and the case
4160            value for firstbyte and reqbyte. */              value for firstbyte and reqbyte. */
4161    
4162            *optionsptr = options = newoptions;              *optionsptr = options = newoptions;
4163            greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);              greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
4164            greedy_non_default = greedy_default ^ 1;              greedy_non_default = greedy_default ^ 1;
4165            req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;              req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
4166                }
4167    
4168            previous = NULL;       /* This item can't be repeated */            previous = NULL;       /* This item can't be repeated */
4169            continue;              /* It is complete */            continue;              /* It is complete */
# Line 3136  for (;; ptr++) Line 4176  for (;; ptr++)
4176    
4177          bravalue = OP_BRA;          bravalue = OP_BRA;
4178          ptr++;          ptr++;
4179          }          }     /* End of switch for character following (? */
4180        }        }       /* End of (? handling */
4181    
4182      /* If PCRE_NO_AUTO_CAPTURE is set, all unadorned brackets become      /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
4183      non-capturing and behave like (?:...) brackets */      all unadorned brackets become non-capturing and behave like (?:...)
4184        brackets. */
4185    
4186      else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)      else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
4187        {        {
4188        bravalue = OP_BRA;        bravalue = OP_BRA;
4189        }        }
4190    
4191      /* Else we have a referencing group; adjust the opcode. If the bracket      /* Else we have a capturing group. */
     number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and  
     arrange for the true number to follow later, in an OP_BRANUMBER item. */  
4192    
4193      else      else
4194        {        {
4195        NUMBERED_GROUP:        NUMBERED_GROUP:
4196        if (++(*brackets) > EXTRACT_BASIC_MAX)        cd->bracount += 1;
4197          {        PUT2(code, 1+LINK_SIZE, cd->bracount);
4198          bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;        skipbytes = 2;
         code[1+LINK_SIZE] = OP_BRANUMBER;  
         PUT2(code, 2+LINK_SIZE, *brackets);  
         skipbytes = 3;  
         }  
       else bravalue = OP_BRA + *brackets;  
4199        }        }
4200    
4201      /* Process nested bracketed re. Assertions may not be repeated, but other      /* Process nested bracketed regex. Assertions may not be repeated, but
4202      kinds can be. We copy code into a non-register variable in order to be able      other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
4203      to pass its address because some compilers complain otherwise. Pass in a      non-register variable in order to be able to pass its address because some
4204      new setting for the ims options if they have changed. */      compilers complain otherwise. Pass in a new setting for the ims options if
4205        they have changed. */
4206    
4207      previous = (bravalue >= OP_ONCE)? code : NULL;      previous = (bravalue >= OP_ONCE)? code : NULL;
4208      *code = bravalue;      *code = bravalue;
4209      tempcode = code;      tempcode = code;
4210      tempreqvary = cd->req_varyopt;     /* Save value before bracket */      tempreqvary = cd->req_varyopt;     /* Save value before bracket */
4211        length_prevgroup = 0;              /* Initialize for pre-compile phase */
4212    
4213      if (!compile_regex(      if (!compile_regex(
4214           newoptions,                   /* The complete new option state */           newoptions,                   /* The complete new option state */
4215           options & PCRE_IMS,           /* The previous ims option state */           options & PCRE_IMS,           /* The previous ims option state */
          brackets,                     /* Extracting bracket count */  
4216           &tempcode,                    /* Where to put code (updated) */           &tempcode,                    /* Where to put code (updated) */
4217           &ptr,                         /* Input pointer (updated) */           &ptr,                         /* Input pointer (updated) */
4218           errorcodeptr,                 /* Where to put an error message */           errorcodeptr,                 /* Where to put an error message */
4219           (bravalue == OP_ASSERTBACK ||           (bravalue == OP_ASSERTBACK ||
4220            bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */            bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
4221           skipbytes,                    /* Skip over OP_COND/OP_BRANUMBER */           skipbytes,                    /* Skip over bracket number */
4222           &subfirstbyte,                /* For possible first char */           &subfirstbyte,                /* For possible first char */
4223           &subreqbyte,                  /* For possible last char */           &subreqbyte,                  /* For possible last char */
4224           bcptr,                        /* Current branch chain */           bcptr,                        /* Current branch chain */
4225           cd))                          /* Tables block */           cd,                           /* Tables block */
4226             (lengthptr == NULL)? NULL :   /* Actual compile phase */
4227               &length_prevgroup           /* Pre-compile phase */
4228             ))
4229        goto FAILED;        goto FAILED;
4230    
4231      /* At the end of compiling, code is still pointing to the start of the      /* At the end of compiling, code is still pointing to the start of the
# Line 3196  for (;; ptr++) Line 4234  for (;; ptr++)
4234      is on the bracket. */      is on the bracket. */
4235    
4236      /* If this is a conditional bracket, check that there are no more than      /* If this is a conditional bracket, check that there are no more than
4237      two branches in the group. */      two branches in the group, or just one if it's a DEFINE group. */
4238    
4239      else if (bravalue == OP_COND)      if (bravalue == OP_COND)
4240        {        {
4241        uschar *tc = code;        uschar *tc = code;
4242        condcount = 0;        int condcount = 0;
4243    
4244        do {        do {
4245           condcount++;           condcount++;
# Line 3209  for (;; ptr++) Line 4247  for (;; ptr++)
4247           }           }
4248        while (*tc != OP_KET);        while (*tc != OP_KET);
4249    
4250        if (condcount > 2)        /* A DEFINE group is never obeyed inline (the "condition" is always
4251          false). It must have only one branch. */
4252    
4253          if (code[LINK_SIZE+1] == OP_DEF)
4254          {          {
4255          *errorcodeptr = ERR27;          if (condcount > 1)
4256          goto FAILED;            {
4257              *errorcodeptr = ERR54;
4258              goto FAILED;
4259              }
4260            bravalue = OP_DEF;   /* Just a flag to suppress char handling below */
4261            }
4262    
4263          /* A "normal" conditional group. If there is just one branch, we must not
4264          make use of its firstbyte or reqbyte, because this is equivalent to an
4265          empty second branch. */
4266    
4267          else
4268            {
4269            if (condcount > 2)
4270              {
4271              *errorcodeptr = ERR27;
4272              goto FAILED;
4273              }
4274            if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
4275          }          }
4276          }
4277    
4278        /* Error if hit end of pattern */
4279    
4280        /* If there is just one branch, we must not make use of its firstbyte or      if (*ptr != ')')
4281        reqbyte, because this is equivalent to an empty second branch. */        {
4282          *errorcodeptr = ERR14;
4283          goto FAILED;
4284          }
4285    
4286        if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;      /* In the pre-compile phase, update the length by the length of the nested
4287        group, less the brackets at either end. Then reduce the compiled code to
4288        just the brackets so that it doesn't use much memory if it is duplicated by
4289        a quantifier. */
4290    
4291        if (lengthptr != NULL)
4292          {
4293          *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
4294          code++;
4295          PUTINC(code, 0, 1 + LINK_SIZE);
4296          *code++ = OP_KET;
4297          PUTINC(code, 0, 1 + LINK_SIZE);
4298        }        }
4299    
4300      /* Handle updating of the required and first characters. Update for normal      /* Otherwise update the main code pointer to the end of the group. */
4301      brackets of all kinds, and conditions with two branches (see code above).  
4302      If the bracket is followed by a quantifier with zero repeat, we have to      else code = tempcode;
4303      back off. Hence the definition of zeroreqbyte and zerofirstbyte outside the  
4304      main loop so that they can be accessed for the back off. */      /* For a DEFINE group, required and first character settings are not
4305        relevant. */
4306    
4307        if (bravalue == OP_DEF) break;
4308    
4309        /* Handle updating of the required and first characters for other types of
4310        group. Update for normal brackets of all kinds, and conditions with two
4311        branches (see code above). If the bracket is followed by a quantifier with
4312        zero repeat, we have to back off. Hence the definition of zeroreqbyte and
4313        zerofirstbyte outside the main loop so that they can be accessed for the
4314        back off. */
4315    
4316      zeroreqbyte = reqbyte;      zeroreqbyte = reqbyte;
4317      zerofirstbyte = firstbyte;      zerofirstbyte = firstbyte;
4318      groupsetfirstbyte = FALSE;      groupsetfirstbyte = FALSE;
4319    
4320      if (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_COND)      if (bravalue >= OP_ONCE)
4321        {        {
4322        /* If we have not yet set a firstbyte in this branch, take it from the        /* If we have not yet set a firstbyte in this branch, take it from the
4323        subpattern, remembering that it was set here so that a repeat of more        subpattern, remembering that it was set here so that a repeat of more
# Line 3272  for (;; ptr++) Line 4358  for (;; ptr++)
4358      firstbyte, looking for an asserted first char. */      firstbyte, looking for an asserted first char. */
4359    
4360      else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;      else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
4361        break;     /* End of processing '(' */
4362    
     /* Now update the main code pointer to the end of the group. */  
   
     code = tempcode;  
   
     /* Error if hit end of pattern */  
   
     if (*ptr != ')')  
       {  
       *errorcodeptr = ERR14;  
       goto FAILED;  
       }  
     break;  
   
     /* Check \ for being a real metacharacter; if not, fall through and handle  
     it as a data character at the start of a string. Escape items are checked  
     for validity in the pre-compiling pass. */  
4363    
4364      case '\\':      /* ===================================================================*/
4365      tempptr = ptr;      /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
     c = check_escape(&ptr, errorcodeptr, *brackets, options, FALSE);  
   
     /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values  
4366      are arranged to be the negation of the corresponding OP_values. For the      are arranged to be the negation of the corresponding OP_values. For the
4367      back references, the values are ESC_REF plus the reference number. Only      back references, the values are ESC_REF plus the reference number. Only
4368      back references and those types that consume a character may be repeated.      back references and those types that consume a character may be repeated.
4369      We can test for values between ESC_b and ESC_Z for the latter; this may      We can test for values between ESC_b and ESC_Z for the latter; this may
4370      have to change if any new ones are ever created. */      have to change if any new ones are ever created. */
4371    
4372        case '\\':
4373        tempptr = ptr;
4374        c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
4375        if (*errorcodeptr != 0) goto FAILED;
4376    
4377      if (c < 0)      if (c < 0)
4378        {        {
4379        if (-c == ESC_Q)            /* Handle start of quoted string */        if (-c == ESC_Q)            /* Handle start of quoted string */
# Line 3310  for (;; ptr++) Line 4383  for (;; ptr++)
4383          continue;          continue;
4384          }          }
4385    
4386          if (-c == ESC_E) continue;  /* Perl ignores an orphan \E */
4387    
4388        /* For metasequences that actually match a character, we disable the        /* For metasequences that actually match a character, we disable the
4389        setting of a first character if it hasn't already been set. */        setting of a first character if it hasn't already been set. */
4390    
# Line 3321  for (;; ptr++) Line 4396  for (;; ptr++)
4396        zerofirstbyte = firstbyte;        zerofirstbyte = firstbyte;
4397        zeroreqbyte = reqbyte;        zeroreqbyte = reqbyte;
4398    
4399        /* Back references are handled specially */        /* \k<name> or \k'name' is a back reference by name (Perl syntax) */
4400    
4401          if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\''))
4402            {
4403            is_recurse = FALSE;
4404            terminator = (*(++ptr) == '<')? '>' : '\'';
4405            goto NAMED_REF_OR_RECURSE;
4406            }
4407    
4408          /* Back references are handled specially; must disable firstbyte if
4409          not set to cope with cases like (?=(\w+))\1: which would otherwise set
4410          ':' later. */
4411    
4412        if (-c >= ESC_REF)        if (-c >= ESC_REF)
4413          {          {
4414          int number = -c - ESC_REF;          recno = -c - ESC_REF;
4415    
4416            HANDLE_REFERENCE:    /* Come here from named backref handling */
4417            if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4418          previous = code;          previous = code;
4419          *code++ = OP_REF;          *code++ = OP_REF;
4420          PUT2INC(code, 0, number);          PUT2INC(code, 0, recno);
4421            cd->backref_map |= (recno < 32)? (1 << recno) : 1;
4422            if (recno > cd->top_backref) cd->top_backref = recno;
4423          }          }
4424    
4425        /* So are Unicode property matches, if supported. We know that get_ucp        /* So are Unicode property matches, if supported. */
       won't fail because it was tested in the pre-pass. */  
4426    
4427  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
4428        else if (-c == ESC_P || -c == ESC_p)        else if (-c == ESC_P || -c == ESC_p)
# Line 3340  for (;; ptr++) Line 4430  for (;; ptr++)
4430          BOOL negated;          BOOL negated;
4431          int pdata;          int pdata;
4432          int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);          int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
4433            if (ptype < 0) goto FAILED;
4434          previous = code;          previous = code;
4435          *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;          *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
4436          *code++ = ptype;          *code++ = ptype;
4437          *code++ = pdata;          *code++ = pdata;
4438          }          }
4439    #else
4440    
4441          /* If Unicode properties are not supported, \X, \P, and \p are not
4442          allowed. */
4443    
4444          else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
4445            {
4446            *errorcodeptr = ERR45;
4447            goto FAILED;
4448            }
4449  #endif  #endif
4450    
4451        /* For the rest, we can obtain the OP value by negating the escape        /* For the rest (including \X when Unicode properties are supported), we
4452        value */        can obtain the OP value by negating the escape value. */
4453    
4454        else        else
4455          {          {
# Line 3372  for (;; ptr++) Line 4473  for (;; ptr++)
4473       mcbuffer[0] = c;       mcbuffer[0] = c;
4474       mclength = 1;       mclength = 1;
4475       }       }
   
4476      goto ONE_CHAR;      goto ONE_CHAR;
4477    
4478    
4479        /* ===================================================================*/
4480      /* Handle a literal character. It is guaranteed not to be whitespace or #      /* Handle a literal character. It is guaranteed not to be whitespace or #
4481      when the extended flag is set. If we are in UTF-8 mode, it may be a      when the extended flag is set. If we are in UTF-8 mode, it may be a
4482      multi-byte literal character. */      multi-byte literal character. */
# Line 3385  for (;; ptr++) Line 4487  for (;; ptr++)
4487      mcbuffer[0] = c;      mcbuffer[0] = c;
4488    
4489  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
4490      if (utf8 && (c & 0xc0) == 0xc0)      if (utf8 && c >= 0xc0)
4491        {        {
4492        while ((ptr[1] & 0xc0) == 0x80)        while ((ptr[1] & 0xc0) == 0x80)
4493          mcbuffer[mclength++] = *(++ptr);          mcbuffer[mclength++] = *(++ptr);
# Line 3436  for (;; ptr++) Line 4538  for (;; ptr++)
4538      }      }
4539    }                   /* end of big loop */    }                   /* end of big loop */
4540    
4541    
4542  /* Control never reaches here by falling through, only by a goto for all the  /* Control never reaches here by falling through, only by a goto for all the
4543  error states. Pass back the position in the pattern so that it can be displayed  error states. Pass back the position in the pattern so that it can be displayed
4544  to the user for diagnosing the error. */  to the user for diagnosing the error. */
# Line 3452  return FALSE; Line 4555  return FALSE;
4555  *     Compile sequence of alternatives           *  *     Compile sequence of alternatives           *
4556  *************************************************/  *************************************************/
4557    
4558  /* On entry, ptr is pointing past the bracket character, but on return  /* On entry, ptr is pointing past the bracket character, but on return it
4559  it points to the closing bracket, or vertical bar, or end of string.  points to the closing bracket, or vertical bar, or end of string. The code
4560  The code variable is pointing at the byte into which the BRA operator has been  variable is pointing at the byte into which the BRA operator has been stored.
4561  stored. If the ims options are changed at the start (for a (?ims: group) or  If the ims options are changed at the start (for a (?ims: group) or during any
4562  during any branch, we need to insert an OP_OPT item at the start of every  branch, we need to insert an OP_OPT item at the start of every following branch
4563  following branch to ensure they get set correctly at run time, and also pass  to ensure they get set correctly at run time, and also pass the new options
4564  the new options into every subsequent branch compile.  into every subsequent branch compile.
4565    
4566    This function is used during the pre-compile phase when we are trying to find
4567    out the amount of memory needed, as well as during the real compile phase. The
4568    value of lengthptr distinguishes the two phases.
4569    
4570  Argument:  Argument:
4571    options        option bits, including any changes for this subpattern    options        option bits, including any changes for this subpattern
4572    oldims         previous settings of ims option bits    oldims         previous settings of ims option bits
   brackets       -> int containing the number of extracting brackets used  
4573    codeptr        -> the address of the current code pointer    codeptr        -> the address of the current code pointer
4574    ptrptr         -> the address of the current pattern pointer    ptrptr         -> the address of the current pattern pointer
4575    errorcodeptr   -> pointer to error code variable    errorcodeptr   -> pointer to error code variable
4576    lookbehind     TRUE if this is a lookbehind assertion    lookbehind     TRUE if this is a lookbehind assertion
4577    skipbytes      skip this many bytes at start (for OP_COND, OP_BRANUMBER)    skipbytes      skip this many bytes at start (for brackets and OP_COND)
4578    firstbyteptr   place to put the first required character, or a negative number    firstbyteptr   place to put the first required character, or a negative number
4579    reqbyteptr     place to put the last required character, or a negative number    reqbyteptr     place to put the last required character, or a negative number
4580    bcptr          pointer to the chain of currently open branches    bcptr          pointer to the chain of currently open branches
4581    cd             points to the data block with tables pointers etc.    cd             points to the data block with tables pointers etc.
4582      lengthptr      NULL during the real compile phase
4583                     points to length accumulator during pre-compile phase
4584    
4585  Returns:      TRUE on success  Returns:         TRUE on success
4586  */  */
4587    
4588  static BOOL  static BOOL
4589  compile_regex(int options, int oldims, int *brackets, uschar **codeptr,  compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
4590    const uschar **ptrptr, int *errorcodeptr, BOOL lookbehind, int skipbytes,    int *errorcodeptr, BOOL lookbehind, int skipbytes, int *firstbyteptr,
4591    int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd)    int *reqbyteptr, branch_chain *bcptr, compile_data *cd, int *lengthptr)
4592  {  {
4593  const uschar *ptr = *ptrptr;  const uschar *ptr = *ptrptr;
4594  uschar *code = *codeptr;  uschar *code = *codeptr;
# Line 3489  uschar *start_bracket = code; Line 4597  uschar *start_bracket = code;
4597  uschar *reverse_count = NULL;  uschar *reverse_count = NULL;
4598  int firstbyte, reqbyte;  int firstbyte, reqbyte;
4599  int branchfirstbyte, branchreqbyte;  int branchfirstbyte, branchreqbyte;
4600    int length;
4601  branch_chain bc;  branch_chain bc;
4602    
4603  bc.outer = bcptr;  bc.outer = bcptr;
# Line 3496  bc.current = code; Line 4605  bc.current = code;
4605    
4606  firstbyte = reqbyte = REQ_UNSET;  firstbyte = reqbyte = REQ_UNSET;
4607    
4608    /* Accumulate the length for use in the pre-compile phase. Start with the
4609    length of the BRA and KET and any extra bytes that are required at the
4610    beginning. We accumulate in a local variable to save frequent testing of
4611    lengthptr for NULL. We cannot do this by looking at the value of code at the
4612    start and end of each alternative, because compiled items are discarded during
4613    the pre-compile phase so that the work space is not exceeded. */
4614    
4615    length = 2 + 2*LINK_SIZE + skipbytes;
4616    
4617    /* WARNING: If the above line is changed for any reason, you must also change
4618    the code that abstracts option settings at the start of the pattern and makes
4619    them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
4620    pre-compile phase to find out whether anything has yet been compiled or not. */
4621    
4622  /* Offset is set zero to mark that this bracket is still open */  /* Offset is set zero to mark that this bracket is still open */
4623    
4624  PUT(code, 1, 0);  PUT(code, 1, 0);
# Line 3511  for (;;) Line 4634  for (;;)
4634      {      {
4635      *code++ = OP_OPT;      *code++ = OP_OPT;
4636      *code++ = options & PCRE_IMS;      *code++ = options & PCRE_IMS;
4637        length += 2;
4638      }      }
4639    
4640    /* Set up dummy OP_REVERSE if lookbehind assertion */    /* Set up dummy OP_REVERSE if lookbehind assertion */
# Line 3520  for (;;) Line 4644  for (;;)
4644      *code++ = OP_REVERSE;      *code++ = OP_REVERSE;
4645      reverse_count = code;      reverse_count = code;
4646      PUTINC(code, 0, 0);      PUTINC(code, 0, 0);
4647        length += 1 + LINK_SIZE;
4648      }      }
4649    
4650    /* Now compile the branch */    /* Now compile the branch; in the pre-compile phase its length gets added
4651      into the length. */
4652    
4653    if (!compile_branch(&options, brackets, &code, &ptr, errorcodeptr,    if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
4654          &branchfirstbyte, &branchreqbyte, &bc, cd))          &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
4655      {      {
4656      *ptrptr = ptr;      *ptrptr = ptr;
4657      return FALSE;      return FALSE;
4658      }      }
4659    
4660    /* If this is the first branch, the firstbyte and reqbyte values for the    /* In the real compile phase, there is some post-processing to be done. */
   branch become the values for the regex. */  
4661    
4662    if (*last_branch != OP_ALT)    if (lengthptr == NULL)
4663      {      {
4664      firstbyte = branchfirstbyte;      /* If this is the first branch, the firstbyte and reqbyte values for the
4665      reqbyte = branchreqbyte;      branch become the values for the regex. */
     }  
4666    
4667    /* If this is not the first branch, the first char and reqbyte have to      if (*last_branch != OP_ALT)
4668    match the values from all the previous branches, except that if the previous        {
4669    value for reqbyte didn't have REQ_VARY set, it can still match, and we set        firstbyte = branchfirstbyte;
4670    REQ_VARY for the regex. */        reqbyte = branchreqbyte;
4671          }
4672    
4673    else      /* If this is not the first branch, the first char and reqbyte have to
4674      {      match the values from all the previous branches, except that if the
4675      /* If we previously had a firstbyte, but it doesn't match the new branch,      previous value for reqbyte didn't have REQ_VARY set, it can still match,
4676      we have to abandon the firstbyte for the regex, but if there was previously      and we set REQ_VARY for the regex. */
     no reqbyte, it takes on the value of the old firstbyte. */  
4677    
4678      if (firstbyte >= 0 && firstbyte != branchfirstbyte)      else
4679        {        {
4680        if (reqbyte < 0) reqbyte = firstbyte;        /* If we previously had a firstbyte, but it doesn't match the new branch,
4681        firstbyte = REQ_NONE;        we have to abandon the firstbyte for the regex, but if there was
4682        }        previously no reqbyte, it takes on the value of the old firstbyte. */
4683    
4684          if (firstbyte >= 0 && firstbyte != branchfirstbyte)
4685            {
4686            if (reqbyte < 0) reqbyte = firstbyte;
4687            firstbyte = REQ_NONE;
4688            }
4689    
4690      /* If we (now or from before) have no firstbyte, a firstbyte from the        /* If we (now or from before) have no firstbyte, a firstbyte from the
4691      branch becomes a reqbyte if there isn't a branch reqbyte. */        branch becomes a reqbyte if there isn't a branch reqbyte. */
4692    
4693      if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)        if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
4694          branchreqbyte = branchfirstbyte;            branchreqbyte = branchfirstbyte;
4695    
4696      /* Now ensure that the reqbytes match */        /* Now ensure that the reqbytes match */
4697    
4698      if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))        if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
4699        reqbyte = REQ_NONE;          reqbyte = REQ_NONE;
4700      else reqbyte |= branchreqbyte;   /* To "or" REQ_VARY */        else reqbyte |= branchreqbyte;   /* To "or" REQ_VARY */
4701      }        }
4702    
4703    /* If lookbehind, check that this branch matches a fixed-length string,      /* If lookbehind, check that this branch matches a fixed-length string, and
4704    and put the length into the OP_REVERSE item. Temporarily mark the end of      put the length into the OP_REVERSE item. Temporarily mark the end of the
4705    the branch with OP_END. */      branch with OP_END. */
4706    
4707    if (lookbehind)      if (lookbehind)
     {  
     int length;  
     *code = OP_END;  
     length = find_fixedlength(last_branch, options);  
     DPRINTF(("fixed length = %d\n", length));  
     if (length < 0)  
4708        {        {
4709        *errorcodeptr = (length == -2)? ERR36 : ERR25;        int fixed_length;
4710        *ptrptr = ptr;        *code = OP_END;
4711        return FALSE;        fixed_length = find_fixedlength(last_branch, options);
4712          DPRINTF(("fixed length = %d\n", fixed_length));
4713          if (fixed_length < 0)
4714            {
4715            *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
4716            *ptrptr = ptr;
4717            return FALSE;
4718            }
4719          PUT(reverse_count, 0, fixed_length);
4720        }        }
     PUT(reverse_count, 0, length);  
4721      }      }
4722    
4723    /* Reached end of expression, either ')' or end of pattern. Go back through    /* Reached end of expression, either ')' or end of pattern. Go back through
# Line 3600  for (;;) Line 4731  for (;;)
4731    
4732    if (*ptr != '|')    if (*ptr != '|')
4733      {      {
4734      int length = code - last_branch;      int branch_length = code - last_branch;
4735      do      do
4736        {        {
4737        int prev_length = GET(last_branch, 1);        int prev_length = GET(last_branch, 1);
4738        PUT(last_branch, 1, length);        PUT(last_branch, 1, branch_length);
4739        length = prev_length;        branch_length = prev_length;
4740        last_branch -= length;        last_branch -= branch_length;
4741        }        }
4742      while (length > 0);      while (branch_length > 0);
4743    
4744      /* Fill in the ket */      /* Fill in the ket */
4745    
# Line 3622  for (;;) Line 4753  for (;;)
4753        {        {
4754        *code++ = OP_OPT;        *code++ = OP_OPT;
4755        *code++ = oldims;        *code++ = oldims;
4756          length += 2;
4757        }        }
4758    
4759      /* Set values to pass back */      /* Set values to pass back */
# Line 3630  for (;;) Line 4762  for (;;)
4762      *ptrptr = ptr;      *ptrptr = ptr;
4763      *firstbyteptr = firstbyte;      *firstbyteptr = firstbyte;
4764      *reqbyteptr = reqbyte;      *reqbyteptr = reqbyte;
4765        if (lengthptr != NULL) *lengthptr += length;
4766      return TRUE;      return TRUE;
4767      }      }
4768    
# Line 3643  for (;;) Line 4776  for (;;)
4776    bc.current = last_branch = code;    bc.current = last_branch = code;
4777    code += 1 + LINK_SIZE;    code += 1 + LINK_SIZE;
4778    ptr++;    ptr++;
4779      length += 1 + LINK_SIZE;
4780    }    }
4781  /* Control never reaches here */  /* Control never reaches here */
4782  }  }
# Line 3693  is_anchored(register const uschar *code, Line 4827  is_anchored(register const uschar *code,
4827    unsigned int backref_map)    unsigned int backref_map)
4828  {  {
4829  do {  do {
4830     const uschar *scode =     const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
4831       first_significant_code(code + 1+LINK_SIZE, options, PCRE_MULTILINE, FALSE);       options, PCRE_MULTILINE, FALSE);
4832     register int op = *scode;     register int op = *scode;
4833    
4834       /* Non-capturing brackets */
4835    
4836       if (op == OP_BRA)
4837         {
4838         if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
4839         }
4840    
4841     /* Capturing brackets */     /* Capturing brackets */
4842    
4843     if (op > OP_BRA)     else if (op == OP_CBRA)
4844       {       {
4845       int new_map;       int n = GET2(scode, 1+LINK_SIZE);
4846       op -= OP_BRA;       int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
      if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);  
      new_map = bracket_map | ((op < 32)? (1 << op) : 1);  
4847       if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;       if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
4848       }       }
4849    
4850     /* Other brackets */     /* Other brackets */
4851    
4852     else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)     else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
4853       {       {
4854       if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;       if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
4855       }       }
# Line 3718  do { Line 4857  do {
4857     /* .* is not anchored unless DOTALL is set and it isn't in brackets that     /* .* is not anchored unless DOTALL is set and it isn't in brackets that
4858     are or may be referenced. */     are or may be referenced. */
4859    
4860     else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&     else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
4861                 op == OP_TYPEPOSSTAR) &&
4862              (*options & PCRE_DOTALL) != 0)              (*options & PCRE_DOTALL) != 0)
4863       {       {
4864       if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;       if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
# Line 3763  is_startline(const uschar *code, unsigne Line 4903  is_startline(const uschar *code, unsigne
4903    unsigned int backref_map)    unsigned int backref_map)
4904  {  {
4905  do {  do {
4906     const uschar *scode = first_significant_code(code + 1+LINK_SIZE, NULL, 0,     const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
4907       FALSE);       NULL, 0, FALSE);
4908     register int op = *scode;     register int op = *scode;
4909    
4910       /* Non-capturing brackets */
4911    
4912       if (op == OP_BRA)
4913         {
4914         if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
4915         }
4916    
4917     /* Capturing brackets */     /* Capturing brackets */
4918    
4919     if (op > OP_BRA)     else if (op == OP_CBRA)
4920       {       {
4921       int new_map;       int n = GET2(scode, 1+LINK_SIZE);
4922       op -= OP_BRA;       int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
      if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);  
      new_map = bracket_map | ((op < 32)? (1 << op) : 1);  
4923       if (!is_startline(scode, new_map, backref_map)) return FALSE;       if (!is_startline(scode, new_map, backref_map)) return FALSE;
4924       }       }
4925    
4926     /* Other brackets */     /* Other brackets */
4927    
4928     else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)     else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
4929       { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }       { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
4930    
4931     /* .* means "start at start or after \n" if it isn't in brackets that     /* .* means "start at start or after \n" if it isn't in brackets that
4932     may be referenced. */     may be referenced. */
4933    
4934     else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)     else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
4935       {       {
4936       if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;       if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
4937       }       }
# Line 3835  do { Line 4980  do {
4980       first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);       first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
4981     register int op = *scode;     register int op = *scode;
4982    
    if (op >= OP_BRA) op = OP_BRA;  
   
4983     switch(op)     switch(op)
4984       {       {
4985       default:       default:
4986       return -1;       return -1;
4987    
4988       case OP_BRA:       case OP_BRA:
4989         case OP_CBRA:
4990       case OP_ASSERT:       case OP_ASSERT:
4991       case OP_ONCE:       case OP_ONCE:
4992       case OP_COND:       case OP_COND:
# Line 3858  do { Line 5002  do {
5002       case OP_CHARNC:       case OP_CHARNC:
5003       case OP_PLUS:       case OP_PLUS:
5004       case OP_MINPLUS:       case OP_MINPLUS:
5005         case OP_POSPLUS:
5006       if (!inassert) return -1;       if (!inassert) return -1;
5007       if (c < 0)       if (c < 0)
5008         {         {
# Line 3911  pcre_compile2(const char *pattern, int o Line 5056  pcre_compile2(const char *pattern, int o
5056    const char **errorptr, int *erroroffset, const unsigned char *tables)    const char **errorptr, int *erroroffset, const unsigned char *tables)
5057  {  {
5058  real_pcre *re;  real_pcre *re;
5059  int length = 1 + LINK_SIZE;      /* For initial BRA plus length */  int length = 1;  /* For final END opcode */
5060  int c, firstbyte, reqbyte;  int firstbyte, reqbyte, newline;
 int bracount = 0;  
 int branch_extra = 0;  
 int branch_newextra;  
 int item_count = -1;  
 int name_count = 0;  
 int max_name_size = 0;  
 int lastitemlength = 0;  
5061  int errorcode = 0;  int errorcode = 0;
5062  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
5063  BOOL utf8;  BOOL utf8;
 BOOL class_utf8;  
5064  #endif  #endif
 BOOL inescq = FALSE;  
 BOOL capturing;  
 unsigned int brastackptr = 0;  
5065  size_t size;  size_t size;
5066  uschar *code;  uschar *code;
5067  const uschar *codestart;  const uschar *codestart;
5068  const uschar *ptr;  const uschar *ptr;
5069  compile_data compile_block;  compile_data compile_block;
5070  int brastack[BRASTACK_SIZE];  compile_data *cd = &compile_block;
5071  uschar bralenstack[BRASTACK_SIZE];  
5072    /* This space is used for "compiling" into during the first phase, when we are
5073    computing the amount of memory that is needed. Compiled items are thrown away
5074    as soon as possible, so that a fairly large buffer should be sufficient for
5075    this purpose. The same space is used in the second phase for remembering where
5076    to fill in forward references to subpatterns. */
5077    
5078    uschar cworkspace[COMPILE_WORK_SIZE];
5079    
5080    
5081    /* Set this early so that early errors get offset 0. */
5082    
5083    ptr = (const uschar *)pattern;
5084    
5085  /* We can't pass back an error message if errorptr is NULL; I guess the best we  /* We can't pass back an error message if errorptr is NULL; I guess the best we
5086  can do is just return NULL, but we can set a code value if there is a code  can do is just return NULL, but we can set a code value if there is a code
# Line 3967  if (utf8 && (options & PCRE_NO_UTF8_CHEC Line 5113  if (utf8 && (options & PCRE_NO_UTF8_CHEC
5113       (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)       (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
5114    {    {
5115    errorcode = ERR44;    errorcode = ERR44;
5116    goto PCRE_EARLY_ERROR_RETURN;    goto PCRE_UTF8_ERROR_RETURN;
5117    }    }
5118  #else  #else
5119  if ((options & PCRE_UTF8) != 0)  if ((options & PCRE_UTF8) != 0)
# Line 3986  if ((options & ~PUBLIC_OPTIONS) != 0) Line 5132  if ((options & ~PUBLIC_OPTIONS) != 0)
5132  /* Set up pointers to the individual character tables */  /* Set up pointers to the individual character tables */
5133    
5134  if (tables == NULL) tables = _pcre_default_tables;  if (tables == NULL) tables = _pcre_default_tables;
5135  compile_block.lcc = tables + lcc_offset;  cd->lcc = tables + lcc_offset;
5136  compile_block.fcc = tables + fcc_offset;  cd->fcc = tables + fcc_offset;
5137  compile_block.cbits = tables + cbits_offset;  cd->cbits = tables + cbits_offset;
5138  compile_block.ctypes = tables + ctypes_offset;  cd->ctypes = tables + ctypes_offset;
   
 /* Maximum back reference and backref bitmap. This is updated for numeric  
 references during the first pass, but for named references during the actual  
 compile pass. The bitmap records up to 31 back references to help in deciding  
 whether (.*) can be treated as anchored or not. */  
   
 compile_block.top_backref = 0;  
 compile_block.backref_map = 0;  
   
 /* Reflect pattern for debugging output */  
   
 DPRINTF(("------------------------------------------------------------------\n"));  
 DPRINTF(("%s\n", pattern));  
5139    
5140  /* The first thing to do is to make a pass over the pattern to compute the  /* Handle different types of newline. The three bits give seven cases. The
5141  amount of store required to hold the compiled code. This does not have to be  current code allows for fixed one- or two-byte sequences, plus "any". */
 perfect as long as errors are overestimates. At the same time we can detect any  
 flag settings right at the start, and extract them. Make an attempt to correct  
 for any counted white space if an "extended" flag setting appears late in the  
 pattern. We can't be so clever for #-comments. */  
   
 ptr = (const uschar *)(pattern - 1);  
 while ((c = *(++ptr)) != 0)  
   {  
   int min, max;  
   int class_optcount;  
   int bracket_length;  
   int duplength;  
5142    
5143    /* If we are inside a \Q...\E sequence, all chars are literal */  switch (options & (PCRE_NEWLINE_CRLF | PCRE_NEWLINE_ANY))
5144      {
5145      case 0: newline = NEWLINE; break;   /* Compile-time default */
5146      case PCRE_NEWLINE_CR: newline = '\r'; break;
5147      case PCRE_NEWLINE_LF: newline = '\n'; break;
5148      case PCRE_NEWLINE_CR+
5149           PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
5150      case PCRE_NEWLINE_ANY: newline = -1; break;
5151      default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
5152      }
5153    
5154    if (inescq)  if (newline < 0)
5155      {
5156      cd->nltype = NLTYPE_ANY;
5157      }
5158    else
5159      {
5160      cd->nltype = NLTYPE_FIXED;
5161      if (newline > 255)
5162      {      {
5163      if ((options & PCRE_AUTO_CALLOUT) != 0) length += 2 + 2*LINK_SIZE;      cd->nllen = 2;
5164      goto NORMAL_CHAR;      cd->nl[0] = (newline >> 8) & 255;
5165        cd->nl[1] = newline & 255;
5166      }      }
5167      else
   /* Otherwise, first check for ignored whitespace and comments */  
   
   if ((options & PCRE_EXTENDED) != 0)  
5168      {      {
5169      if ((compile_block.ctypes[c] & ctype_space) != 0) continue;      cd->nllen = 1;
5170      if (c == '#')      cd->nl[0] = newline;
       {  
       /* The space before the ; is to avoid a warning on a silly compiler  
       on the Macintosh. */  
       while ((c = *(++ptr)) != 0 && c != NEWLINE) ;  
       if (c == 0) break;  
       continue;  
       }  
5171      }      }
5172      }
5173    
5174    item_count++;    /* Is zero for the first non-comment item */  /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
5175    references to help in deciding whether (.*) can be treated as anchored or not.
5176    /* Allow space for auto callout before every item except quantifiers. */  */
   
   if ((options & PCRE_AUTO_CALLOUT) != 0 &&  
        c != '*' && c != '+' && c != '?' &&  
        (c != '{' || !is_counted_repeat(ptr + 1)))  
     length += 2 + 2*LINK_SIZE;  
5177    
5178    switch(c)  cd->top_backref = 0;
5179      {  cd->backref_map = 0;
     /* A backslashed item may be an escaped data character or it may be a  
     character type. */  
5180    
5181      case '\\':  /* Reflect pattern for debugging output */
     c = check_escape(&ptr, &errorcode, bracount, options, FALSE);  
     if (errorcode != 0) goto PCRE_ERROR_RETURN;  
5182    
5183      lastitemlength = 1;     /* Default length of last item for repeats */  DPRINTF(("------------------------------------------------------------------\n"));
5184    DPRINTF(("%s\n", pattern));
5185    
5186      if (c >= 0)             /* Data character */  /* Pretend to compile the pattern while actually just accumulating the length
5187        {  of memory required. This behaviour is triggered by passing a non-NULL final
5188        length += 2;          /* For a one-byte character */  argument to compile_regex(). We pass a block of workspace (cworkspace) for it
5189    to compile parts of the pattern into; the compiled code is discarded when it is
5190    no longer needed, so hopefully this workspace will never overflow, though there
5191    is a test for its doing so. */
5192    
5193    cd->bracount = 0;
5194    cd->names_found = 0;
5195    cd->name_entry_size = 0;
5196    cd->name_table = NULL;
5197    cd->start_workspace = cworkspace;
5198    cd->start_code = cworkspace;
5199    cd->hwm = cworkspace;
5200    cd->start_pattern = (const uschar *)pattern;
5201    cd->end_pattern = (const uschar *)(pattern + strlen(pattern));
5202    cd->req_varyopt = 0;
5203    cd->nopartial = FALSE;
5204    cd->external_options = options;
5205    
5206    /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
5207    don't need to look at the result of the function here. The initial options have
5208    been put into the cd block so that they can be changed if an option setting is
5209    found within the regex right at the beginning. Bringing initial option settings
5210    outside can help speed up starting point checks. */
5211    
5212  #ifdef SUPPORT_UTF8  code = cworkspace;
5213        if (utf8 && c > 127)  *code = OP_BRA;
5214          {  (void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,
5215          int i;    &code, &ptr, &errorcode, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, &length);
5216          for (i = 0; i < _pcre_utf8_table1_size; i++)  if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
           if (c <= _pcre_utf8_table1[i]) break;  
         length += i;  
         lastitemlength += i;  
         }  
 #endif  
5217    
5218        continue;  DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
5219        }    cd->hwm - cworkspace));
5220    
5221      /* If \Q, enter "literal" mode */  if (length > MAX_PATTERN_SIZE)
5222      {
5223      errorcode = ERR20;
5224      goto PCRE_EARLY_ERROR_RETURN;
5225      }
5226    
5227      if (-c == ESC_Q)  /* Compute the size of data block needed and get it, either from malloc or
5228        {  externally provided function. Integer overflow should no longer be possible
5229        inescq = TRUE;  because nowadays we limit the maximum value of cd->names_found and
5230        continue;  cd->name_entry_size. */
       }  
5231    
5232      /* \X is supported only if Unicode property support is compiled */  size = length + sizeof(real_pcre) + cd->names_found * (cd->name_entry_size + 3);
5233    re = (real_pcre *)(pcre_malloc)(size);
5234    
5235  #ifndef SUPPORT_UCP  if (re == NULL)
5236      if (-c == ESC_X)    {
5237        {    errorcode = ERR21;
5238        errorcode = ERR45;    goto PCRE_EARLY_ERROR_RETURN;
5239        goto PCRE_ERROR_RETURN;    }
       }  
 #endif  
5240    
5241      /* \P and \p are for Unicode properties, but only when the support has  /* Put in the magic number, and save the sizes, initial options, and character
5242      been compiled. Each item needs 3 bytes. */  table pointer. NULL is used for the default character tables. The nullpad field
5243    is at the end; it's there to help in the case when a regex compiled on a system
5244      else if (-c == ESC_P || -c == ESC_p)  with 4-byte pointers is run on another with 8-byte pointers. */
       {  
 #ifdef SUPPORT_UCP  
       BOOL negated;  
       BOOL pdata;  
       length += 3;  
       lastitemlength = 3;  
       if (get_ucp(&ptr, &negated, &pdata, &errorcode) < 0)  
         goto PCRE_ERROR_RETURN;  
       continue;  
 #else  
       errorcode = ERR45;  
       goto PCRE_ERROR_RETURN;  
 #endif  
       }  
   
     /* Other escapes need one byte */  
   
     length++;  
   
     /* A back reference needs an additional 2 bytes, plus either one or 5  
     bytes for a repeat. We also need to keep the value of the highest  
     back reference. */  
   
     if (c <= -ESC_REF)  
       {  
       int refnum = -c - ESC_REF;  
       compile_block.backref_map |= (refnum < 32)? (1 << refnum) : 1;  
       if (refnum > compile_block.top_backref)  
         compile_block.top_backref = refnum;  
       length += 2;   /* For single back reference */  
       if (ptr[1] == '{' && is_counted_repeat(ptr+2))  
         {  
         ptr = read_repeat_counts(ptr+2, &min, &max, &errorcode);  
         if (errorcode != 0) goto PCRE_ERROR_RETURN;  
         if ((min == 0 && (max == 1 || max == -1)) ||  
           (min == 1 && max == -1))  
             length++;  
         else length += 5;  
         if (ptr[1] == '?') ptr++;  
         }  
       }  
     continue;