/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory | Revision Log | View Patch

revision 323 by ph10, Wed Mar 5 17:23:42 2008 UTC revision 496 by ph10, Tue Mar 2 19:11:17 2010 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2008 University of Cambridge             Copyright (c) 1997-2010 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 53  supporting internal functions that are n Line 53  supporting internal functions that are n
53  #include "pcre_internal.h"  #include "pcre_internal.h"
54    
55    
56  /* When DEBUG is defined, we need the pcre_printint() function, which is also  /* When PCRE_DEBUG is defined, we need the pcre_printint() function, which is
57  used by pcretest. DEBUG is not defined when building a production library. */  also used by pcretest. PCRE_DEBUG is not defined when building a production
58    library. */
59    
60  #ifdef DEBUG  #ifdef PCRE_DEBUG
61  #include "pcre_printint.src"  #include "pcre_printint.src"
62  #endif  #endif
63    
# Line 97  are simple data values; negative values Line 98  are simple data values; negative values
98  on. Zero means further processing is needed (for things like \x), or the escape  on. Zero means further processing is needed (for things like \x), or the escape
99  is invalid. */  is invalid. */
100    
101  #ifndef EBCDIC  /* This is the "normal" table for ASCII systems */  #ifndef EBCDIC
102    
103    /* This is the "normal" table for ASCII systems or for EBCDIC systems running
104    in UTF-8 mode. */
105    
106  static const short int escapes[] = {  static const short int escapes[] = {
107       0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */       0,                       0,
108       0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */       0,                       0,
109     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */       0,                       0,
110  -ESC_H,      0,      0, -ESC_K,      0,      0,      0,      0,   /* H - O */       0,                       0,
111  -ESC_P, -ESC_Q, -ESC_R, -ESC_S,      0,      0, -ESC_V, -ESC_W,   /* P - W */       0,                       0,
112  -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */       CHAR_COLON,              CHAR_SEMICOLON,
113     '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */       CHAR_LESS_THAN_SIGN,     CHAR_EQUALS_SIGN,
114  -ESC_h,      0,      0, -ESC_k,      0,      0,  ESC_n,      0,   /* h - o */       CHAR_GREATER_THAN_SIGN,  CHAR_QUESTION_MARK,
115  -ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0, -ESC_v, -ESC_w,   /* p - w */       CHAR_COMMERCIAL_AT,      -ESC_A,
116       0,      0, -ESC_z                                            /* x - z */       -ESC_B,                  -ESC_C,
117         -ESC_D,                  -ESC_E,
118         0,                       -ESC_G,
119         -ESC_H,                  0,
120         0,                       -ESC_K,
121         0,                       0,
122         0,                       0,
123         -ESC_P,                  -ESC_Q,
124         -ESC_R,                  -ESC_S,
125         0,                       0,
126         -ESC_V,                  -ESC_W,
127         -ESC_X,                  0,
128         -ESC_Z,                  CHAR_LEFT_SQUARE_BRACKET,
129         CHAR_BACKSLASH,          CHAR_RIGHT_SQUARE_BRACKET,
130         CHAR_CIRCUMFLEX_ACCENT,  CHAR_UNDERSCORE,
131         CHAR_GRAVE_ACCENT,       7,
132         -ESC_b,                  0,
133         -ESC_d,                  ESC_e,
134         ESC_f,                   0,
135         -ESC_h,                  0,
136         0,                       -ESC_k,
137         0,                       0,
138         ESC_n,                   0,
139         -ESC_p,                  0,
140         ESC_r,                   -ESC_s,
141         ESC_tee,                 0,
142         -ESC_v,                  -ESC_w,
143         0,                       0,
144         -ESC_z
145  };  };
146    
147  #else           /* This is the "abnormal" table for EBCDIC systems */  #else
148    
149    /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
150    
151  static const short int escapes[] = {  static const short int escapes[] = {
152  /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',  /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
153  /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,  /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
# Line 142  static const short int escapes[] = { Line 178  static const short int escapes[] = {
178    
179  /* Table of special "verbs" like (*PRUNE). This is a short table, so it is  /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
180  searched linearly. Put all the names into a single string, in order to reduce  searched linearly. Put all the names into a single string, in order to reduce
181  the number of relocations when a shared library is dynamically linked. */  the number of relocations when a shared library is dynamically linked. The
182    string is built from string macros so that it works in UTF-8 mode on EBCDIC
183    platforms. */
184    
185  typedef struct verbitem {  typedef struct verbitem {
186    int   len;    int   len;
# Line 150  typedef struct verbitem { Line 188  typedef struct verbitem {
188  } verbitem;  } verbitem;
189    
190  static const char verbnames[] =  static const char verbnames[] =
191    "ACCEPT\0"    STRING_ACCEPT0
192    "COMMIT\0"    STRING_COMMIT0
193    "F\0"    STRING_F0
194    "FAIL\0"    STRING_FAIL0
195    "PRUNE\0"    STRING_PRUNE0
196    "SKIP\0"    STRING_SKIP0
197    "THEN";    STRING_THEN;
198    
199  static verbitem verbs[] = {  static const verbitem verbs[] = {
200    { 6, OP_ACCEPT },    { 6, OP_ACCEPT },
201    { 6, OP_COMMIT },    { 6, OP_COMMIT },
202    { 1, OP_FAIL },    { 1, OP_FAIL },
# Line 168  static verbitem verbs[] = { Line 206  static verbitem verbs[] = {
206    { 4, OP_THEN  }    { 4, OP_THEN  }
207  };  };
208    
209  static int verbcount = sizeof(verbs)/sizeof(verbitem);  static const int verbcount = sizeof(verbs)/sizeof(verbitem);
210    
211    
212  /* Tables of names of POSIX character classes and their lengths. The names are  /* Tables of names of POSIX character classes and their lengths. The names are
# Line 178  length entry. The first three must be al Line 216  length entry. The first three must be al
216  for handling case independence. */  for handling case independence. */
217    
218  static const char posix_names[] =  static const char posix_names[] =
219    "alpha\0"  "lower\0"  "upper\0"  "alnum\0"  "ascii\0"  "blank\0"    STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
220    "cntrl\0"  "digit\0"  "graph\0"  "print\0"  "punct\0"  "space\0"    STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
221    "word\0"   "xdigit";    STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
222      STRING_word0  STRING_xdigit;
223    
224  static const uschar posix_name_lengths[] = {  static const uschar posix_name_lengths[] = {
225    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
# Line 295  static const char error_texts[] = Line 334  static const char error_texts[] =
334    /* 55 */    /* 55 */
335    "repeating a DEFINE group is not allowed\0"    "repeating a DEFINE group is not allowed\0"
336    "inconsistent NEWLINE options\0"    "inconsistent NEWLINE options\0"
337    "\\g is not followed by a braced name or an optionally braced non-zero number\0"    "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
338    "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number\0"    "a numbered reference must not be zero\0"
339    "(*VERB) with an argument is not supported\0"    "(*VERB) with an argument is not supported\0"
340    /* 60 */    /* 60 */
341    "(*VERB) not recognized\0"    "(*VERB) not recognized\0"
342    "number is too big\0"    "number is too big\0"
343    "subpattern name expected\0"    "subpattern name expected\0"
344    "digit expected after (?+";    "digit expected after (?+\0"
345      "] is an invalid data character in JavaScript compatibility mode\0"
346      /* 65 */
347      "different names for subpatterns of the same number are not allowed";
348    
349    
350  /* Table to identify digits and hex digits. This is used when compiling  /* Table to identify digits and hex digits. This is used when compiling
# Line 321  For convenience, we use the same bit def Line 363  For convenience, we use the same bit def
363    
364  Then we can use ctype_digit and ctype_xdigit in the code. */  Then we can use ctype_digit and ctype_xdigit in the code. */
365    
366  #ifndef EBCDIC  /* This is the "normal" case, for ASCII systems */  #ifndef EBCDIC
367    
368    /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
369    UTF-8 mode. */
370    
371  static const unsigned char digitab[] =  static const unsigned char digitab[] =
372    {    {
373    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
# Line 357  static const unsigned char digitab[] = Line 403  static const unsigned char digitab[] =
403    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
404    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
405    
406  #else           /* This is the "abnormal" case, for EBCDIC systems */  #else
407    
408    /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
409    
410  static const unsigned char digitab[] =  static const unsigned char digitab[] =
411    {    {
412    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
# Line 454  static const char * Line 503  static const char *
503  find_error_text(int n)  find_error_text(int n)
504  {  {
505  const char *s = error_texts;  const char *s = error_texts;
506  for (; n > 0; n--) while (*s++ != 0);  for (; n > 0; n--) while (*s++ != 0) {};
507  return s;  return s;
508  }  }
509    
# Line 502  if (c == 0) *errorcodeptr = ERR1; Line 551  if (c == 0) *errorcodeptr = ERR1;
551  in a table. A non-zero result is something that can be returned immediately.  in a table. A non-zero result is something that can be returned immediately.
552  Otherwise further processing may be required. */  Otherwise further processing may be required. */
553    
554  #ifndef EBCDIC  /* ASCII coding */  #ifndef EBCDIC  /* ASCII/UTF-8 coding */
555  else if (c < '0' || c > 'z') {}                           /* Not alphanumeric */  else if (c < CHAR_0 || c > CHAR_z) {}                     /* Not alphanumeric */
556  else if ((i = escapes[c - '0']) != 0) c = i;  else if ((i = escapes[c - CHAR_0]) != 0) c = i;
557    
558  #else           /* EBCDIC coding */  #else           /* EBCDIC coding */
559  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphanumeric */  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphanumeric */
# Line 523  else Line 572  else
572      /* A number of Perl escapes are not handled by PCRE. We give an explicit      /* A number of Perl escapes are not handled by PCRE. We give an explicit
573      error. */      error. */
574    
575      case 'l':      case CHAR_l:
576      case 'L':      case CHAR_L:
577      case 'N':      case CHAR_N:
578      case 'u':      case CHAR_u:
579      case 'U':      case CHAR_U:
580      *errorcodeptr = ERR37;      *errorcodeptr = ERR37;
581      break;      break;
582    
583      /* \g must be followed by a number, either plain or braced. If positive, it      /* \g must be followed by one of a number of specific things:
     is an absolute backreference. If negative, it is a relative backreference.  
     This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a  
     reference to a named group. This is part of Perl's movement towards a  
     unified syntax for back references. As this is synonymous with \k{name}, we  
     fudge it up by pretending it really was \k. */  
584    
585      case 'g':      (1) A number, either plain or braced. If positive, it is an absolute
586      if (ptr[1] == '{')      backreference. If negative, it is a relative backreference. This is a Perl
587        5.10 feature.
588    
589        (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
590        is part of Perl's movement towards a unified syntax for back references. As
591        this is synonymous with \k{name}, we fudge it up by pretending it really
592        was \k.
593    
594        (3) For Oniguruma compatibility we also support \g followed by a name or a
595        number either in angle brackets or in single quotes. However, these are
596        (possibly recursive) subroutine calls, _not_ backreferences. Just return
597        the -ESC_g code (cf \k). */
598    
599        case CHAR_g:
600        if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
601          {
602          c = -ESC_g;
603          break;
604          }
605    
606        /* Handle the Perl-compatible cases */
607    
608        if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
609        {        {
610        const uschar *p;        const uschar *p;
611        for (p = ptr+2; *p != 0 && *p != '}'; p++)        for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
612          if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;          if (*p != CHAR_MINUS && (digitab[*p] & ctype_digit) == 0) break;
613        if (*p != 0 && *p != '}')        if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)
614          {          {
615          c = -ESC_k;          c = -ESC_k;
616          break;          break;
# Line 554  else Line 620  else
620        }        }
621      else braced = FALSE;      else braced = FALSE;
622    
623      if (ptr[1] == '-')      if (ptr[1] == CHAR_MINUS)
624        {        {
625        negated = TRUE;        negated = TRUE;
626        ptr++;        ptr++;
# Line 563  else Line 629  else
629    
630      c = 0;      c = 0;
631      while ((digitab[ptr[1]] & ctype_digit) != 0)      while ((digitab[ptr[1]] & ctype_digit) != 0)
632        c = c * 10 + *(++ptr) - '0';        c = c * 10 + *(++ptr) - CHAR_0;
633    
634      if (c < 0)      if (c < 0)   /* Integer overflow */
635        {        {
636        *errorcodeptr = ERR61;        *errorcodeptr = ERR61;
637        break;        break;
638        }        }
639    
640      if (c == 0 || (braced && *(++ptr) != '}'))      if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
641        {        {
642        *errorcodeptr = ERR57;        *errorcodeptr = ERR57;
643        break;        break;
644        }        }
645    
646        if (c == 0)
647          {
648          *errorcodeptr = ERR58;
649          break;
650          }
651    
652      if (negated)      if (negated)
653        {        {
654        if (c > bracount)        if (c > bracount)
# Line 602  else Line 674  else
674      value is greater than 377, the least significant 8 bits are taken. Inside a      value is greater than 377, the least significant 8 bits are taken. Inside a
675      character class, \ followed by a digit is always an octal number. */      character class, \ followed by a digit is always an octal number. */
676    
677      case '1': case '2': case '3': case '4': case '5':      case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
678      case '6': case '7': case '8': case '9':      case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
679    
680      if (!isclass)      if (!isclass)
681        {        {
682        oldptr = ptr;        oldptr = ptr;
683        c -= '0';        c -= CHAR_0;
684        while ((digitab[ptr[1]] & ctype_digit) != 0)        while ((digitab[ptr[1]] & ctype_digit) != 0)
685          c = c * 10 + *(++ptr) - '0';          c = c * 10 + *(++ptr) - CHAR_0;
686        if (c < 0)        if (c < 0)    /* Integer overflow */
687          {          {
688          *errorcodeptr = ERR61;          *errorcodeptr = ERR61;
689          break;          break;
# Line 628  else Line 700  else
700      generates a binary zero byte and treats the digit as a following literal.      generates a binary zero byte and treats the digit as a following literal.
701      Thus we have to pull back the pointer by one. */      Thus we have to pull back the pointer by one. */
702    
703      if ((c = *ptr) >= '8')      if ((c = *ptr) >= CHAR_8)
704        {        {
705        ptr--;        ptr--;
706        c = 0;        c = 0;
# Line 641  else Line 713  else
713      to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more      to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
714      than 3 octal digits. */      than 3 octal digits. */
715    
716      case '0':      case CHAR_0:
717      c -= '0';      c -= CHAR_0;
718      while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')      while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
719          c = c * 8 + *(++ptr) - '0';          c = c * 8 + *(++ptr) - CHAR_0;
720      if (!utf8 && c > 255) *errorcodeptr = ERR51;      if (!utf8 && c > 255) *errorcodeptr = ERR51;
721      break;      break;
722    
# Line 652  else Line 724  else
724      than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is      than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
725      treated as a data character. */      treated as a data character. */
726    
727      case 'x':      case CHAR_x:
728      if (ptr[1] == '{')      if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
729        {        {
730        const uschar *pt = ptr + 2;        const uschar *pt = ptr + 2;
731        int count = 0;        int count = 0;
# Line 662  else Line 734  else
734        while ((digitab[*pt] & ctype_xdigit) != 0)        while ((digitab[*pt] & ctype_xdigit) != 0)
735          {          {
736          register int cc = *pt++;          register int cc = *pt++;
737          if (c == 0 && cc == '0') continue;     /* Leading zeroes */          if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
738          count++;          count++;
739    
740  #ifndef EBCDIC  /* ASCII coding */  #ifndef EBCDIC  /* ASCII/UTF-8 coding */
741          if (cc >= 'a') cc -= 32;               /* Convert to upper case */          if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
742          c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));          c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
743  #else           /* EBCDIC coding */  #else           /* EBCDIC coding */
744          if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */          if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
745          c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));          c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
746  #endif  #endif
747          }          }
748    
749        if (*pt == '}')        if (*pt == CHAR_RIGHT_CURLY_BRACKET)
750          {          {
751          if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;          if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
752          ptr = pt;          ptr = pt;
# Line 690  else Line 762  else
762      c = 0;      c = 0;
763      while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)      while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
764        {        {
765        int cc;                               /* Some compilers don't like ++ */        int cc;                                  /* Some compilers don't like */
766        cc = *(++ptr);                        /* in initializers */        cc = *(++ptr);                           /* ++ in initializers */
767  #ifndef EBCDIC  /* ASCII coding */  #ifndef EBCDIC  /* ASCII/UTF-8 coding */
768        if (cc >= 'a') cc -= 32;              /* Convert to upper case */        if (cc >= CHAR_a) cc -= 32;              /* Convert to upper case */
769        c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));        c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
770  #else           /* EBCDIC coding */  #else           /* EBCDIC coding */
771        if (cc <= 'z') cc += 64;              /* Convert to upper case */        if (cc <= CHAR_z) cc += 64;              /* Convert to upper case */
772        c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));        c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
773  #endif  #endif
774        }        }
775      break;      break;
# Line 706  else Line 778  else
778      This coding is ASCII-specific, but then the whole concept of \cx is      This coding is ASCII-specific, but then the whole concept of \cx is
779      ASCII-specific. (However, an EBCDIC equivalent has now been added.) */      ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
780    
781      case 'c':      case CHAR_c:
782      c = *(++ptr);      c = *(++ptr);
783      if (c == 0)      if (c == 0)
784        {        {
# Line 714  else Line 786  else
786        break;        break;
787        }        }
788    
789  #ifndef EBCDIC  /* ASCII coding */  #ifndef EBCDIC  /* ASCII/UTF-8 coding */
790      if (c >= 'a' && c <= 'z') c -= 32;      if (c >= CHAR_a && c <= CHAR_z) c -= 32;
791      c ^= 0x40;      c ^= 0x40;
792  #else           /* EBCDIC coding */  #else           /* EBCDIC coding */
793      if (c >= 'a' && c <= 'z') c += 64;      if (c >= CHAR_a && c <= CHAR_z) c += 64;
794      c ^= 0xC0;      c ^= 0xC0;
795  #endif  #endif
796      break;      break;
# Line 780  if (c == 0) goto ERROR_RETURN; Line 852  if (c == 0) goto ERROR_RETURN;
852  /* \P or \p can be followed by a name in {}, optionally preceded by ^ for  /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
853  negation. */  negation. */
854    
855  if (c == '{')  if (c == CHAR_LEFT_CURLY_BRACKET)
856    {    {
857    if (ptr[1] == '^')    if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
858      {      {
859      *negptr = TRUE;      *negptr = TRUE;
860      ptr++;      ptr++;
# Line 791  if (c == '{') Line 863  if (c == '{')
863      {      {
864      c = *(++ptr);      c = *(++ptr);
865      if (c == 0) goto ERROR_RETURN;      if (c == 0) goto ERROR_RETURN;
866      if (c == '}') break;      if (c == CHAR_RIGHT_CURLY_BRACKET) break;
867      name[i] = c;      name[i] = c;
868      }      }
869    if (c !='}') goto ERROR_RETURN;    if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
870    name[i] = 0;    name[i] = 0;
871    }    }
872    
# Line 859  is_counted_repeat(const uschar *p) Line 931  is_counted_repeat(const uschar *p)
931  {  {
932  if ((digitab[*p++] & ctype_digit) == 0) return FALSE;  if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
933  while ((digitab[*p] & ctype_digit) != 0) p++;  while ((digitab[*p] & ctype_digit) != 0) p++;
934  if (*p == '}') return TRUE;  if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
935    
936  if (*p++ != ',') return FALSE;  if (*p++ != CHAR_COMMA) return FALSE;
937  if (*p == '}') return TRUE;  if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
938    
939  if ((digitab[*p++] & ctype_digit) == 0) return FALSE;  if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
940  while ((digitab[*p] & ctype_digit) != 0) p++;  while ((digitab[*p] & ctype_digit) != 0) p++;
941    
942  return (*p == '}');  return (*p == CHAR_RIGHT_CURLY_BRACKET);
943  }  }
944    
945    
# Line 900  int max = -1; Line 972  int max = -1;
972  /* Read the minimum value and do a paranoid check: a negative value indicates  /* Read the minimum value and do a paranoid check: a negative value indicates
973  an integer overflow. */  an integer overflow. */
974    
975  while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';  while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - CHAR_0;
976  if (min < 0 || min > 65535)  if (min < 0 || min > 65535)
977    {    {
978    *errorcodeptr = ERR5;    *errorcodeptr = ERR5;
# Line 910  if (min < 0 || min > 65535) Line 982  if (min < 0 || min > 65535)
982  /* Read the maximum value if there is one, and again do a paranoid check on its size.  /* Read the maximum value if there is one, and again do a paranoid check on its size.
983  Also, max must not be less than min. */  Also, max must not be less than min. */
984    
985  if (*p == '}') max = min; else  if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
986    {    {
987    if (*(++p) != '}')    if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
988      {      {
989      max = 0;      max = 0;
990      while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';      while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - CHAR_0;
991      if (max < 0 || max > 65535)      if (max < 0 || max > 65535)
992        {        {
993        *errorcodeptr = ERR5;        *errorcodeptr = ERR5;
# Line 940  return p; Line 1012  return p;
1012    
1013    
1014  /*************************************************  /*************************************************
1015  *       Find forward referenced subpattern       *  *  Subroutine for finding forward reference      *
1016  *************************************************/  *************************************************/
1017    
1018  /* This function scans along a pattern's text looking for capturing  /* This recursive function is called only from find_parens() below. The
1019    top-level call starts at the beginning of the pattern. All other calls must
1020    start at a parenthesis. It scans along a pattern's text looking for capturing
1021  subpatterns, and counting them. If it finds a named pattern that matches the  subpatterns, and counting them. If it finds a named pattern that matches the
1022  name it is given, it returns its number. Alternatively, if the name is NULL, it  name it is given, it returns its number. Alternatively, if the name is NULL, it
1023  returns when it reaches a given numbered subpattern. This is used for forward  returns when it reaches a given numbered subpattern. We know that if (?P< is
1024  references to subpatterns. We know that if (?P< is encountered, the name will  encountered, the name will be terminated by '>' because that is checked in the
1025  be terminated by '>' because that is checked in the first pass.  first pass. Recursion is used to keep track of subpatterns that reset the
1026    capturing group numbers - the (?| feature.
1027    
1028  Arguments:  Arguments:
1029    ptr          current position in the pattern    ptrptr       address of the current character pointer (updated)
1030    count        current count of capturing parens so far encountered    cd           compile background data
1031    name         name to seek, or NULL if seeking a numbered subpattern    name         name to seek, or NULL if seeking a numbered subpattern
1032    lorn         name length, or subpattern number if name is NULL    lorn         name length, or subpattern number if name is NULL
1033    xmode        TRUE if we are in /x mode    xmode        TRUE if we are in /x mode
1034      count        pointer to the current capturing subpattern number (updated)
1035    
1036  Returns:       the number of the named subpattern, or -1 if not found  Returns:       the number of the named subpattern, or -1 if not found
1037  */  */
1038    
1039  static int  static int
1040  find_parens(const uschar *ptr, int count, const uschar *name, int lorn,  find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn,
1041    BOOL xmode)    BOOL xmode, int *count)
1042  {  {
1043  const uschar *thisname;  uschar *ptr = *ptrptr;
1044    int start_count = *count;
1045    int hwm_count = start_count;
1046    BOOL dup_parens = FALSE;
1047    
1048  for (; *ptr != 0; ptr++)  /* If the first character is a parenthesis, check on the type of group we are
1049    dealing with. The very first call may not start with a parenthesis. */
1050    
1051    if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1052    {    {
1053    int term;    if (ptr[1] == CHAR_QUESTION_MARK &&
1054          ptr[2] == CHAR_VERTICAL_LINE)
1055        {
1056        ptr += 3;
1057        dup_parens = TRUE;
1058        }
1059    
1060      /* Handle a normal, unnamed capturing parenthesis */
1061    
1062      else if (ptr[1] != CHAR_QUESTION_MARK && ptr[1] != CHAR_ASTERISK)
1063        {
1064        *count += 1;
1065        if (name == NULL && *count == lorn) return *count;
1066        ptr++;
1067        }
1068    
1069      /* Handle a condition. If it is an assertion, just carry on so that it
1070      is processed as normal. If not, skip to the closing parenthesis of the
1071      condition (there can't be any nested parens). */
1072    
1073      else if (ptr[2] == CHAR_LEFT_PARENTHESIS)
1074        {
1075        ptr += 2;
1076        if (ptr[1] != CHAR_QUESTION_MARK)
1077          {
1078          while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
1079          if (*ptr != 0) ptr++;
1080          }
1081        }
1082    
1083      /* We have either (? or (* and not a condition */
1084    
1085      else
1086        {
1087        ptr += 2;
1088        if (*ptr == CHAR_P) ptr++;                      /* Allow optional P */
1089    
1090        /* We have to disambiguate (?<! and (?<= from (?<name> for named groups */
1091    
1092        if ((*ptr == CHAR_LESS_THAN_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK &&
1093            ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)
1094          {
1095          int term;
1096          const uschar *thisname;
1097          *count += 1;
1098          if (name == NULL && *count == lorn) return *count;
1099          term = *ptr++;
1100          if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;
1101          thisname = ptr;
1102          while (*ptr != term) ptr++;
1103          if (name != NULL && lorn == ptr - thisname &&
1104              strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1105            return *count;
1106          term++;
1107          }
1108        }
1109      }
1110    
1111    /* Past any initial parenthesis handling, scan for parentheses or vertical
1112    bars. */
1113    
1114    for (; *ptr != 0; ptr++)
1115      {
1116    /* Skip over backslashed characters and also entire \Q...\E */    /* Skip over backslashed characters and also entire \Q...\E */
1117    
1118    if (*ptr == '\\')    if (*ptr == CHAR_BACKSLASH)
1119      {      {
1120      if (*(++ptr) == 0) return -1;      if (*(++ptr) == 0) goto FAIL_EXIT;
1121      if (*ptr == 'Q') for (;;)      if (*ptr == CHAR_Q) for (;;)
1122        {        {
1123        while (*(++ptr) != 0 && *ptr != '\\');        while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1124        if (*ptr == 0) return -1;        if (*ptr == 0) goto FAIL_EXIT;
1125        if (*(++ptr) == 'E') break;        if (*(++ptr) == CHAR_E) break;
1126        }        }
1127      continue;      continue;
1128      }      }
1129    
1130    /* Skip over character classes */    /* Skip over character classes; this logic must be similar to the way they
1131      are handled for real. If the first character is '^', skip it. Also, if the
1132      first few characters (either before or after ^) are \Q\E or \E we skip them
1133      too. This makes for compatibility with Perl. Note the use of STR macros to
1134      encode "Q\\E" so that it works in UTF-8 on EBCDIC platforms. */
1135    
1136    if (*ptr == '[')    if (*ptr == CHAR_LEFT_SQUARE_BRACKET)
1137      {      {
1138      while (*(++ptr) != ']')      BOOL negate_class = FALSE;
1139        for (;;)
1140          {
1141          if (ptr[1] == CHAR_BACKSLASH)
1142            {
1143            if (ptr[2] == CHAR_E)
1144              ptr+= 2;
1145            else if (strncmp((const char *)ptr+2,
1146                     STR_Q STR_BACKSLASH STR_E, 3) == 0)
1147              ptr += 4;
1148            else
1149              break;
1150            }
1151          else if (!negate_class && ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1152            {
1153            negate_class = TRUE;
1154            ptr++;
1155            }
1156          else break;
1157          }
1158    
1159        /* If the next character is ']', it is a data character that must be
1160        skipped, except in JavaScript compatibility mode. */
1161    
1162        if (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET &&
1163            (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
1164          ptr++;
1165    
1166        while (*(++ptr) != CHAR_RIGHT_SQUARE_BRACKET)
1167        {        {
1168        if (*ptr == 0) return -1;        if (*ptr == 0) return -1;
1169        if (*ptr == '\\')        if (*ptr == CHAR_BACKSLASH)
1170          {          {
1171          if (*(++ptr) == 0) return -1;          if (*(++ptr) == 0) goto FAIL_EXIT;
1172          if (*ptr == 'Q') for (;;)          if (*ptr == CHAR_Q) for (;;)
1173            {            {
1174            while (*(++ptr) != 0 && *ptr != '\\');            while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1175            if (*ptr == 0) return -1;            if (*ptr == 0) goto FAIL_EXIT;
1176            if (*(++ptr) == 'E') break;            if (*(++ptr) == CHAR_E) break;
1177            }            }
1178          continue;          continue;
1179          }          }
# Line 1008  for (; *ptr != 0; ptr++) Line 1183  for (; *ptr != 0; ptr++)
1183    
1184    /* Skip comments in /x mode */    /* Skip comments in /x mode */
1185    
1186    if (xmode && *ptr == '#')    if (xmode && *ptr == CHAR_NUMBER_SIGN)
1187      {      {
1188      while (*(++ptr) != 0 && *ptr != '\n');      while (*(++ptr) != 0 && *ptr != CHAR_NL) {};
1189      if (*ptr == 0) return -1;      if (*ptr == 0) goto FAIL_EXIT;
1190      continue;      continue;
1191      }      }
1192    
1193    /* An opening parens must now be a real metacharacter */    /* Check for the special metacharacters */
1194    
1195    if (*ptr != '(') continue;    if (*ptr == CHAR_LEFT_PARENTHESIS)
   if (ptr[1] != '?' && ptr[1] != '*')  
1196      {      {
1197      count++;      int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, count);
1198      if (name == NULL && count == lorn) return count;      if (rc > 0) return rc;
1199      continue;      if (*ptr == 0) goto FAIL_EXIT;
1200      }      }
1201    
1202    ptr += 2;    else if (*ptr == CHAR_RIGHT_PARENTHESIS)
1203    if (*ptr == 'P') ptr++;                      /* Allow optional P */      {
1204        if (dup_parens && *count < hwm_count) *count = hwm_count;
1205        *ptrptr = ptr;
1206        return -1;
1207        }
1208    
1209    /* We have to disambiguate (?<! and (?<= from (?<name> */    else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)
1210        {
1211        if (*count > hwm_count) hwm_count = *count;
1212        *count = start_count;
1213        }
1214      }
1215    
1216    if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&  FAIL_EXIT:
1217         *ptr != '\'')  *ptrptr = ptr;
1218      continue;  return -1;
1219    }
1220    
1221    
1222    
1223    
1224    /*************************************************
1225    *       Find forward referenced subpattern       *
1226    *************************************************/
1227    
1228    /* This function scans along a pattern's text looking for capturing
1229    subpatterns, and counting them. If it finds a named pattern that matches the
1230    name it is given, it returns its number. Alternatively, if the name is NULL, it
1231    returns when it reaches a given numbered subpattern. This is used for forward
1232    references to subpatterns. We used to be able to start this scan from the
1233    current compiling point, using the current count value from cd->bracount, and
1234    do it all in a single loop, but the addition of the possibility of duplicate
1235    subpattern numbers means that we have to scan from the very start, in order to
1236    take account of such duplicates, and to use a recursive function to keep track
1237    of the different types of group.
1238    
1239    Arguments:
1240      cd           compile background data
1241      name         name to seek, or NULL if seeking a numbered subpattern
1242      lorn         name length, or subpattern number if name is NULL
1243      xmode        TRUE if we are in /x mode
1244    
1245    Returns:       the number of the found subpattern, or -1 if not found
1246    */
1247    
1248    count++;  static int
1249    find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode)
1250    {
1251    uschar *ptr = (uschar *)cd->start_pattern;
1252    int count = 0;
1253    int rc;
1254    
1255    /* If the pattern does not start with an opening parenthesis, the first call
1256    to find_parens_sub() will scan right to the end (if necessary). However, if it
1257    does start with a parenthesis, find_parens_sub() will return when it hits the
1258    matching closing parens. That is why we have to have a loop. */
1259    
1260    if (name == NULL && count == lorn) return count;  for (;;)
1261    term = *ptr++;    {
1262    if (term == '<') term = '>';    rc = find_parens_sub(&ptr, cd, name, lorn, xmode, &count);
1263    thisname = ptr;    if (rc > 0 || *ptr++ == 0) break;
   while (*ptr != term) ptr++;  
   if (name != NULL && lorn == ptr - thisname &&  
       strncmp((const char *)name, (const char *)thisname, lorn) == 0)  
     return count;  
1264    }    }
1265    
1266  return -1;  return rc;
1267  }  }
1268    
1269    
1270    
1271    
1272  /*************************************************  /*************************************************
1273  *      Find first significant op code            *  *      Find first significant op code            *
1274  *************************************************/  *************************************************/
# Line 1100  for (;;) Line 1318  for (;;)
1318    
1319      case OP_CALLOUT:      case OP_CALLOUT:
1320      case OP_CREF:      case OP_CREF:
1321        case OP_NCREF:
1322      case OP_RREF:      case OP_RREF:
1323        case OP_NRREF:
1324      case OP_DEF:      case OP_DEF:
1325      code += _pcre_OP_lengths[*code];      code += _pcre_OP_lengths[*code];
1326      break;      break;
# Line 1116  for (;;) Line 1336  for (;;)
1336    
1337    
1338  /*************************************************  /*************************************************
1339  *        Find the fixed length of a pattern      *  *        Find the fixed length of a branch       *
1340  *************************************************/  *************************************************/
1341    
1342  /* Scan a pattern and compute the fixed length of subject that will match it,  /* Scan a branch and compute the fixed length of subject that will match it,
1343  if the length is fixed. This is needed for dealing with backward assertions.  if the length is fixed. This is needed for dealing with backward assertions.
1344  In UTF8 mode, the result is in characters rather than bytes.  In UTF8 mode, the result is in characters rather than bytes. The branch is
1345    temporarily terminated with OP_END when this function is called.
1346    
1347    This function is called when a backward assertion is encountered, so that if it
1348    fails, the error message can point to the correct place in the pattern.
1349    However, we cannot do this when the assertion contains subroutine calls,
1350    because they can be forward references. We solve this by remembering this case
1351    and doing the check at the end; a flag specifies which mode we are running in.
1352    
1353  Arguments:  Arguments:
1354    code     points to the start of the pattern (the bracket)    code     points to the start of the pattern (the bracket)
1355    options  the compiling options    options  the compiling options
1356      atend    TRUE if called when the pattern is complete
1357      cd       the "compile data" structure
1358    
1359  Returns:   the fixed length, or -1 if there is no fixed length,  Returns:   the fixed length,
1360                 or -1 if there is no fixed length,
1361               or -2 if \C was encountered               or -2 if \C was encountered
1362                 or -3 if an OP_RECURSE item was encountered and atend is FALSE
1363  */  */
1364    
1365  static int  static int
1366  find_fixedlength(uschar *code, int options)  find_fixedlength(uschar *code, int options, BOOL atend, compile_data *cd)
1367  {  {
1368  int length = -1;  int length = -1;
1369    
# Line 1145  branch, check the length against that of Line 1376  branch, check the length against that of
1376  for (;;)  for (;;)
1377    {    {
1378    int d;    int d;
1379      uschar *ce, *cs;
1380    register int op = *cc;    register int op = *cc;
1381    switch (op)    switch (op)
1382      {      {
# Line 1152  for (;;) Line 1384  for (;;)
1384      case OP_BRA:      case OP_BRA:
1385      case OP_ONCE:      case OP_ONCE:
1386      case OP_COND:      case OP_COND:
1387      d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);      d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options, atend, cd);
1388      if (d < 0) return d;      if (d < 0) return d;
1389      branchlength += d;      branchlength += d;
1390      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
# Line 1175  for (;;) Line 1407  for (;;)
1407      branchlength = 0;      branchlength = 0;
1408      break;      break;
1409    
1410        /* A true recursion implies not fixed length, but a subroutine call may
1411        be OK. If the subroutine is a forward reference, we can't deal with
1412        it until the end of the pattern, so return -3. */
1413    
1414        case OP_RECURSE:
1415        if (!atend) return -3;
1416        cs = ce = (uschar *)cd->start_code + GET(cc, 1);  /* Start subpattern */
1417        do ce += GET(ce, 1); while (*ce == OP_ALT);       /* End subpattern */
1418        if (cc > cs && cc < ce) return -1;                /* Recursion */
1419        d = find_fixedlength(cs + 2, options, atend, cd);
1420        if (d < 0) return d;
1421        branchlength += d;
1422        cc += 1 + LINK_SIZE;
1423        break;
1424    
1425      /* Skip over assertive subpatterns */      /* Skip over assertive subpatterns */
1426    
1427      case OP_ASSERT:      case OP_ASSERT:
# Line 1188  for (;;) Line 1435  for (;;)
1435    
1436      case OP_REVERSE:      case OP_REVERSE:
1437      case OP_CREF:      case OP_CREF:
1438        case OP_NCREF:
1439      case OP_RREF:      case OP_RREF:
1440        case OP_NRREF:
1441      case OP_DEF:      case OP_DEF:
1442      case OP_OPT:      case OP_OPT:
1443      case OP_CALLOUT:      case OP_CALLOUT:
# Line 1211  for (;;) Line 1460  for (;;)
1460      branchlength++;      branchlength++;
1461      cc += 2;      cc += 2;
1462  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1463      if ((options & PCRE_UTF8) != 0)      if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)
1464        {        cc += _pcre_utf8_table4[cc[-1] & 0x3f];
       while ((*cc & 0xc0) == 0x80) cc++;  
       }  
1465  #endif  #endif
1466      break;      break;
1467    
# Line 1225  for (;;) Line 1472  for (;;)
1472      branchlength += GET2(cc,1);      branchlength += GET2(cc,1);
1473      cc += 4;      cc += 4;
1474  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1475      if ((options & PCRE_UTF8) != 0)      if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)
1476        {        cc += _pcre_utf8_table4[cc[-1] & 0x3f];
       while((*cc & 0x80) == 0x80) cc++;  
       }  
1477  #endif  #endif
1478      break;      break;
1479    
# Line 1252  for (;;) Line 1497  for (;;)
1497      case OP_NOT_WORDCHAR:      case OP_NOT_WORDCHAR:
1498      case OP_WORDCHAR:      case OP_WORDCHAR:
1499      case OP_ANY:      case OP_ANY:
1500        case OP_ALLANY:
1501      branchlength++;      branchlength++;
1502      cc++;      cc++;
1503      break;      break;
# Line 1306  for (;;) Line 1552  for (;;)
1552    
1553    
1554  /*************************************************  /*************************************************
1555  *    Scan compiled regex for numbered bracket    *  *    Scan compiled regex for specific bracket    *
1556  *************************************************/  *************************************************/
1557    
1558  /* This little function scans through a compiled pattern until it finds a  /* This little function scans through a compiled pattern until it finds a
1559  capturing bracket with the given number.  capturing bracket with the given number, or, if the number is negative, an
1560    instance of OP_REVERSE for a lookbehind. The function is global in the C sense
1561    so that it can be called from pcre_study() when finding the minimum matching
1562    length.
1563    
1564  Arguments:  Arguments:
1565    code        points to start of expression    code        points to start of expression
1566    utf8        TRUE in UTF-8 mode    utf8        TRUE in UTF-8 mode
1567    number      the required bracket number    number      the required bracket number or negative to find a lookbehind
1568    
1569  Returns:      pointer to the opcode for the bracket, or NULL if not found  Returns:      pointer to the opcode for the bracket, or NULL if not found
1570  */  */
1571    
1572  static const uschar *  const uschar *
1573  find_bracket(const uschar *code, BOOL utf8, int number)  _pcre_find_bracket(const uschar *code, BOOL utf8, int number)
1574  {  {
1575  for (;;)  for (;;)
1576    {    {
# Line 1334  for (;;) Line 1583  for (;;)
1583    
1584    if (c == OP_XCLASS) code += GET(code, 1);    if (c == OP_XCLASS) code += GET(code, 1);
1585    
1586      /* Handle lookbehind */
1587    
1588      else if (c == OP_REVERSE)
1589        {
1590        if (number < 0) return (uschar *)code;
1591        code += _pcre_OP_lengths[c];
1592        }
1593    
1594    /* Handle capturing bracket */    /* Handle capturing bracket */
1595    
1596    else if (c == OP_CBRA)    else if (c == OP_CBRA)
# Line 1400  for (;;) Line 1657  for (;;)
1657        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1658        break;        break;
1659        }        }
1660    #else
1661        (void)(utf8);  /* Keep compiler happy by referencing function argument */
1662  #endif  #endif
1663      }      }
1664    }    }
# Line 1493  for (;;) Line 1752  for (;;)
1752        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1753        break;        break;
1754        }        }
1755    #else
1756        (void)(utf8);  /* Keep compiler happy by referencing function argument */
1757  #endif  #endif
1758      }      }
1759    }    }
# Line 1544  for (code = first_significant_code(code Line 1805  for (code = first_significant_code(code
1805    
1806    /* Groups with zero repeats can of course be empty; skip them. */    /* Groups with zero repeats can of course be empty; skip them. */
1807    
1808    if (c == OP_BRAZERO || c == OP_BRAMINZERO)    if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO)
1809      {      {
1810      code += _pcre_OP_lengths[c];      code += _pcre_OP_lengths[c];
1811      do code += GET(code, 1); while (*code == OP_ALT);      do code += GET(code, 1); while (*code == OP_ALT);
# Line 1559  for (code = first_significant_code(code Line 1820  for (code = first_significant_code(code
1820      BOOL empty_branch;      BOOL empty_branch;
1821      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
1822    
1823      /* Scan a closed bracket */      /* If a conditional group has only one branch, there is a second, implied,
1824        empty branch, so just skip over the conditional, because it could be empty.
1825        Otherwise, scan the individual branches of the group. */
1826    
1827      empty_branch = FALSE;      if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
     do  
       {  
       if (!empty_branch && could_be_empty_branch(code, endcode, utf8))  
         empty_branch = TRUE;  
1828        code += GET(code, 1);        code += GET(code, 1);
1829        else
1830          {
1831          empty_branch = FALSE;
1832          do
1833            {
1834            if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1835              empty_branch = TRUE;
1836            code += GET(code, 1);
1837            }
1838          while (*code == OP_ALT);
1839          if (!empty_branch) return FALSE;   /* All branches are non-empty */
1840        }        }
1841      while (*code == OP_ALT);  
     if (!empty_branch) return FALSE;   /* All branches are non-empty */  
1842      c = *code;      c = *code;
1843      continue;      continue;
1844      }      }
# Line 1630  for (code = first_significant_code(code Line 1899  for (code = first_significant_code(code
1899      case OP_NOT_WORDCHAR:      case OP_NOT_WORDCHAR:
1900      case OP_WORDCHAR:      case OP_WORDCHAR:
1901      case OP_ANY:      case OP_ANY:
1902        case OP_ALLANY:
1903      case OP_ANYBYTE:      case OP_ANYBYTE:
1904      case OP_CHAR:      case OP_CHAR:
1905      case OP_CHARNC:      case OP_CHARNC:
# Line 1686  for (code = first_significant_code(code Line 1956  for (code = first_significant_code(code
1956      case OP_QUERY:      case OP_QUERY:
1957      case OP_MINQUERY:      case OP_MINQUERY:
1958      case OP_POSQUERY:      case OP_POSQUERY:
1959        if (utf8 && code[1] >= 0xc0) code += _pcre_utf8_table4[code[1] & 0x3f];
1960        break;
1961    
1962      case OP_UPTO:      case OP_UPTO:
1963      case OP_MINUPTO:      case OP_MINUPTO:
1964      case OP_POSUPTO:      case OP_POSUPTO:
1965      if (utf8) while ((code[2] & 0xc0) == 0x80) code++;      if (utf8 && code[3] >= 0xc0) code += _pcre_utf8_table4[code[3] & 0x3f];
1966      break;      break;
1967  #endif  #endif
1968      }      }
# Line 1722  static BOOL Line 1995  static BOOL
1995  could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,  could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1996    BOOL utf8)    BOOL utf8)
1997  {  {
1998  while (bcptr != NULL && bcptr->current >= code)  while (bcptr != NULL && bcptr->current_branch >= code)
1999    {    {
2000    if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;    if (!could_be_empty_branch(bcptr->current_branch, endcode, utf8))
2001        return FALSE;
2002    bcptr = bcptr->outer;    bcptr = bcptr->outer;
2003    }    }
2004  return TRUE;  return TRUE;
# Line 1770  int terminator; /* Don't combin Line 2044  int terminator; /* Don't combin
2044  terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */  terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
2045  for (++ptr; *ptr != 0; ptr++)  for (++ptr; *ptr != 0; ptr++)
2046    {    {
2047    if (*ptr == '\\' && ptr[1] == ']') ptr++; else    if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) ptr++; else
2048      {      {
2049      if (*ptr == ']') return FALSE;      if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2050      if (*ptr == terminator && ptr[1] == ']')      if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2051        {        {
2052        *endptr = ptr;        *endptr = ptr;
2053        return TRUE;        return TRUE;
# Line 1824  return -1; Line 2098  return -1;
2098  that is referenced. This means that groups can be replicated for fixed  that is referenced. This means that groups can be replicated for fixed
2099  repetition simply by copying (because the recursion is allowed to refer to  repetition simply by copying (because the recursion is allowed to refer to
2100  earlier groups that are outside the current group). However, when a group is  earlier groups that are outside the current group). However, when a group is
2101  optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before  optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
2102  it, after it has been compiled. This means that any OP_RECURSE items within it  inserted before it, after it has been compiled. This means that any OP_RECURSE
2103  that refer to the group itself or any contained groups have to have their  items within it that refer to the group itself or any contained groups have to
2104  offsets adjusted. That is one of the jobs of this function. Before it is called,  have their offsets adjusted. That is one of the jobs of this function. Before it
2105  the partially compiled regex must be temporarily terminated with OP_END.  is called, the partially compiled regex must be temporarily terminated with
2106    OP_END.
2107    
2108  This function has been extended with the possibility of forward references for  This function has been extended with the possibility of forward references for
2109  recursions and subroutine calls. It must also check the list of such references  recursions and subroutine calls. It must also check the list of such references
# Line 1963  get_othercase_range(unsigned int *cptr, Line 2238  get_othercase_range(unsigned int *cptr,
2238  unsigned int c, othercase, next;  unsigned int c, othercase, next;
2239    
2240  for (c = *cptr; c <= d; c++)  for (c = *cptr; c <= d; c++)
2241    { if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) break; }    { if ((othercase = UCD_OTHERCASE(c)) != c) break; }
2242    
2243  if (c > d) return FALSE;  if (c > d) return FALSE;
2244    
# Line 1972  next = othercase + 1; Line 2247  next = othercase + 1;
2247    
2248  for (++c; c <= d; c++)  for (++c; c <= d; c++)
2249    {    {
2250    if (_pcre_ucp_othercase(c) != next) break;    if (UCD_OTHERCASE(c) != next) break;
2251    next++;    next++;
2252    }    }
2253    
# Line 2018  if ((options & PCRE_EXTENDED) != 0) Line 2293  if ((options & PCRE_EXTENDED) != 0)
2293    for (;;)    for (;;)
2294      {      {
2295      while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;      while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2296      if (*ptr == '#')      if (*ptr == CHAR_NUMBER_SIGN)
2297        {        {
2298        while (*(++ptr) != 0)        while (*(++ptr) != 0)
2299          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
# Line 2030  if ((options & PCRE_EXTENDED) != 0) Line 2305  if ((options & PCRE_EXTENDED) != 0)
2305  /* If the next item is one that we can handle, get its value. A non-negative  /* If the next item is one that we can handle, get its value. A non-negative
2306  value is a character, a negative value is an escape value. */  value is a character, a negative value is an escape value. */
2307    
2308  if (*ptr == '\\')  if (*ptr == CHAR_BACKSLASH)
2309    {    {
2310    int temperrorcode = 0;    int temperrorcode = 0;
2311    next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);    next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
# Line 2055  if ((options & PCRE_EXTENDED) != 0) Line 2330  if ((options & PCRE_EXTENDED) != 0)
2330    for (;;)    for (;;)
2331      {      {
2332      while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;      while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2333      if (*ptr == '#')      if (*ptr == CHAR_NUMBER_SIGN)
2334        {        {
2335        while (*(++ptr) != 0)        while (*(++ptr) != 0)
2336          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
# Line 2066  if ((options & PCRE_EXTENDED) != 0) Line 2341  if ((options & PCRE_EXTENDED) != 0)
2341    
2342  /* If the next thing is itself optional, we have to give up. */  /* If the next thing is itself optional, we have to give up. */
2343    
2344  if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)  if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
2345    return FALSE;    strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
2346        return FALSE;
2347    
2348  /* Now compare the next item with the previous opcode. If the previous is a  /* Now compare the next item with the previous opcode. If the previous is a
2349  positive single character match, "item" either contains the character or, if  positive single character match, "item" either contains the character or, if
# Line 2082  if (next >= 0) switch(op_code) Line 2358  if (next >= 0) switch(op_code)
2358    case OP_CHAR:    case OP_CHAR:
2359  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2360    if (utf8 && item > 127) { GETCHAR(item, utf8_char); }    if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2361    #else
2362      (void)(utf8_char);  /* Keep compiler happy by referencing function argument */
2363  #endif  #endif
2364    return item != next;    return item != next;
2365    
# Line 2100  if (next >= 0) switch(op_code) Line 2378  if (next >= 0) switch(op_code)
2378      unsigned int othercase;      unsigned int othercase;
2379      if (next < 128) othercase = cd->fcc[next]; else      if (next < 128) othercase = cd->fcc[next]; else
2380  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2381      othercase = _pcre_ucp_othercase((unsigned int)next);      othercase = UCD_OTHERCASE((unsigned int)next);
2382  #else  #else
2383      othercase = NOTACHAR;      othercase = NOTACHAR;
2384  #endif  #endif
# Line 2121  if (next >= 0) switch(op_code) Line 2399  if (next >= 0) switch(op_code)
2399      unsigned int othercase;      unsigned int othercase;
2400      if (next < 128) othercase = cd->fcc[next]; else      if (next < 128) othercase = cd->fcc[next]; else
2401  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2402      othercase = _pcre_ucp_othercase(next);      othercase = UCD_OTHERCASE(next);
2403  #else  #else
2404      othercase = NOTACHAR;      othercase = NOTACHAR;
2405  #endif  #endif
# Line 2382  BOOL utf8 = FALSE; Line 2660  BOOL utf8 = FALSE;
2660  uschar *utf8_char = NULL;  uschar *utf8_char = NULL;
2661  #endif  #endif
2662    
2663  #ifdef DEBUG  #ifdef PCRE_DEBUG
2664  if (lengthptr != NULL) DPRINTF((">> start branch\n"));  if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2665  #endif  #endif
2666    
# Line 2441  for (;; ptr++) Line 2719  for (;; ptr++)
2719    
2720    if (lengthptr != NULL)    if (lengthptr != NULL)
2721      {      {
2722  #ifdef DEBUG  #ifdef PCRE_DEBUG
2723      if (code > cd->hwm) cd->hwm = code;                 /* High water info */      if (code > cd->hwm) cd->hwm = code;                 /* High water info */
2724  #endif  #endif
2725      if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */      if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
# Line 2503  for (;; ptr++) Line 2781  for (;; ptr++)
2781    
2782    if (inescq && c != 0)    if (inescq && c != 0)
2783      {      {
2784      if (c == '\\' && ptr[1] == 'E')      if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
2785        {        {
2786        inescq = FALSE;        inescq = FALSE;
2787        ptr++;        ptr++;
# Line 2529  for (;; ptr++) Line 2807  for (;; ptr++)
2807    /* Fill in length of a previous callout, except when the next thing is    /* Fill in length of a previous callout, except when the next thing is
2808    a quantifier. */    a quantifier. */
2809    
2810    is_quantifier = c == '*' || c == '+' || c == '?' ||    is_quantifier =
2811      (c == '{' && is_counted_repeat(ptr+1));      c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
2812        (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
2813    
2814    if (!is_quantifier && previous_callout != NULL &&    if (!is_quantifier && previous_callout != NULL &&
2815         after_manual_callout-- <= 0)         after_manual_callout-- <= 0)
# Line 2545  for (;; ptr++) Line 2824  for (;; ptr++)
2824    if ((options & PCRE_EXTENDED) != 0)    if ((options & PCRE_EXTENDED) != 0)
2825      {      {
2826      if ((cd->ctypes[c] & ctype_space) != 0) continue;      if ((cd->ctypes[c] & ctype_space) != 0) continue;
2827      if (c == '#')      if (c == CHAR_NUMBER_SIGN)
2828        {        {
2829        while (*(++ptr) != 0)        while (*(++ptr) != 0)
2830          {          {
# Line 2570  for (;; ptr++) Line 2849  for (;; ptr++)
2849      {      {
2850      /* ===================================================================*/      /* ===================================================================*/
2851      case 0:                        /* The branch terminates at string end */      case 0:                        /* The branch terminates at string end */
2852      case '|':                      /* or | or ) */      case CHAR_VERTICAL_LINE:       /* or | or ) */
2853      case ')':      case CHAR_RIGHT_PARENTHESIS:
2854      *firstbyteptr = firstbyte;      *firstbyteptr = firstbyte;
2855      *reqbyteptr = reqbyte;      *reqbyteptr = reqbyte;
2856      *codeptr = code;      *codeptr = code;
# Line 2593  for (;; ptr++) Line 2872  for (;; ptr++)
2872      /* Handle single-character metacharacters. In multiline mode, ^ disables      /* Handle single-character metacharacters. In multiline mode, ^ disables
2873      the setting of any following char as a first character. */      the setting of any following char as a first character. */
2874    
2875      case '^':      case CHAR_CIRCUMFLEX_ACCENT:
2876      if ((options & PCRE_MULTILINE) != 0)      if ((options & PCRE_MULTILINE) != 0)
2877        {        {
2878        if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;        if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
# Line 2602  for (;; ptr++) Line 2881  for (;; ptr++)
2881      *code++ = OP_CIRC;      *code++ = OP_CIRC;
2882      break;      break;
2883    
2884      case '$':      case CHAR_DOLLAR_SIGN:
2885      previous = NULL;      previous = NULL;
2886      *code++ = OP_DOLL;      *code++ = OP_DOLL;
2887      break;      break;
# Line 2610  for (;; ptr++) Line 2889  for (;; ptr++)
2889      /* There can never be a first char if '.' is first, whatever happens about      /* There can never be a first char if '.' is first, whatever happens about
2890      repeats. The value of reqbyte doesn't change either. */      repeats. The value of reqbyte doesn't change either. */
2891    
2892      case '.':      case CHAR_DOT:
2893      if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;      if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2894      zerofirstbyte = firstbyte;      zerofirstbyte = firstbyte;
2895      zeroreqbyte = reqbyte;      zeroreqbyte = reqbyte;
2896      previous = code;      previous = code;
2897      *code++ = OP_ANY;      *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
2898      break;      break;
2899    
2900    
# Line 2630  for (;; ptr++) Line 2909  for (;; ptr++)
2909      opcode is compiled. It may optionally have a bit map for characters < 256,      opcode is compiled. It may optionally have a bit map for characters < 256,
2910      but those above are explicitly listed afterwards. A flag byte tells      but those above are explicitly listed afterwards. A flag byte tells
2911      whether the bitmap is present, and whether this is a negated class or not.      whether the bitmap is present, and whether this is a negated class or not.
     */  
2912    
2913      case '[':      In JavaScript compatibility mode, an isolated ']' causes an error. In
2914        default (Perl) mode, it is treated as a data character. */
2915    
2916        case CHAR_RIGHT_SQUARE_BRACKET:
2917        if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
2918          {
2919          *errorcodeptr = ERR64;
2920          goto FAILED;
2921          }
2922        goto NORMAL_CHAR;
2923    
2924        case CHAR_LEFT_SQUARE_BRACKET:
2925      previous = code;      previous = code;
2926    
2927      /* PCRE supports POSIX class stuff inside a class. Perl gives an error if      /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2928      they are encountered at the top level, so we'll do that too. */      they are encountered at the top level, so we'll do that too. */
2929    
2930      if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&      if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
2931             ptr[1] == CHAR_EQUALS_SIGN) &&
2932          check_posix_syntax(ptr, &tempptr))          check_posix_syntax(ptr, &tempptr))
2933        {        {
2934        *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;        *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
2935        goto FAILED;        goto FAILED;
2936        }        }
2937    
# Line 2653  for (;; ptr++) Line 2943  for (;; ptr++)
2943      for (;;)      for (;;)
2944        {        {
2945        c = *(++ptr);        c = *(++ptr);
2946        if (c == '\\')        if (c == CHAR_BACKSLASH)
2947          {          {
2948          if (ptr[1] == 'E') ptr++;          if (ptr[1] == CHAR_E)
2949            else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;            ptr++;
2950              else break;          else if (strncmp((const char *)ptr+1,
2951                              STR_Q STR_BACKSLASH STR_E, 3) == 0)
2952              ptr += 3;
2953            else
2954              break;
2955          }          }
2956        else if (!negate_class && c == '^')        else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
2957          negate_class = TRUE;          negate_class = TRUE;
2958        else break;        else break;
2959        }        }
2960    
2961        /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
2962        an initial ']' is taken as a data character -- the code below handles
2963        that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
2964        [^] must match any character, so generate OP_ALLANY. */
2965    
2966        if (c == CHAR_RIGHT_SQUARE_BRACKET &&
2967            (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
2968          {
2969          *code++ = negate_class? OP_ALLANY : OP_FAIL;
2970          if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2971          zerofirstbyte = firstbyte;
2972          break;
2973          }
2974    
2975      /* If a class contains a negative special such as \S, we need to flip the      /* If a class contains a negative special such as \S, we need to flip the
2976      negation flag at the end, so that support for characters > 255 works      negation flag at the end, so that support for characters > 255 works
2977      correctly (they are all included in the class). */      correctly (they are all included in the class). */
# Line 2721  for (;; ptr++) Line 3029  for (;; ptr++)
3029    
3030        if (inescq)        if (inescq)
3031          {          {
3032          if (c == '\\' && ptr[1] == 'E')     /* If we are at \E */          if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)  /* If we are at \E */
3033            {            {
3034            inescq = FALSE;                   /* Reset literal state */            inescq = FALSE;                   /* Reset literal state */
3035            ptr++;                            /* Skip the 'E' */            ptr++;                            /* Skip the 'E' */
# Line 2736  for (;; ptr++) Line 3044  for (;; ptr++)
3044        [.ch.] and [=ch=] ("collating elements") and fault them, as Perl        [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
3045        5.6 and 5.8 do. */        5.6 and 5.8 do. */
3046    
3047        if (c == '[' &&        if (c == CHAR_LEFT_SQUARE_BRACKET &&
3048            (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&            (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3049            check_posix_syntax(ptr, &tempptr))             ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
3050          {          {
3051          BOOL local_negate = FALSE;          BOOL local_negate = FALSE;
3052          int posix_class, taboffset, tabopt;          int posix_class, taboffset, tabopt;
3053          register const uschar *cbits = cd->cbits;          register const uschar *cbits = cd->cbits;
3054          uschar pbits[32];          uschar pbits[32];
3055    
3056          if (ptr[1] != ':')          if (ptr[1] != CHAR_COLON)
3057            {            {
3058            *errorcodeptr = ERR31;            *errorcodeptr = ERR31;
3059            goto FAILED;            goto FAILED;
3060            }            }
3061    
3062          ptr += 2;          ptr += 2;
3063          if (*ptr == '^')          if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
3064            {            {
3065            local_negate = TRUE;            local_negate = TRUE;
3066            should_flip_negation = TRUE;  /* Note negative special */            should_flip_negation = TRUE;  /* Note negative special */
# Line 2825  for (;; ptr++) Line 3133  for (;; ptr++)
3133        to 'or' into the one we are building. We assume they have more than one        to 'or' into the one we are building. We assume they have more than one
3134        character in them, so set class_charcount bigger than one. */        character in them, so set class_charcount bigger than one. */
3135    
3136        if (c == '\\')        if (c == CHAR_BACKSLASH)
3137          {          {
3138          c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);          c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3139          if (*errorcodeptr != 0) goto FAILED;          if (*errorcodeptr != 0) goto FAILED;
3140    
3141          if (-c == ESC_b) c = '\b';       /* \b is backspace in a class */          if (-c == ESC_b) c = CHAR_BS;       /* \b is backspace in a class */
3142          else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */          else if (-c == ESC_X) c = CHAR_X;   /* \X is literal X in a class */
3143          else if (-c == ESC_R) c = 'R';   /* \R is literal R in a class */          else if (-c == ESC_R) c = CHAR_R;   /* \R is literal R in a class */
3144          else if (-c == ESC_Q)            /* Handle start of quoted string */          else if (-c == ESC_Q)            /* Handle start of quoted string */
3145            {            {
3146            if (ptr[1] == '\\' && ptr[2] == 'E')            if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3147              {              {
3148              ptr += 2; /* avoid empty string */              ptr += 2; /* avoid empty string */
3149              }              }
# Line 3061  for (;; ptr++) Line 3369  for (;; ptr++)
3369        entirely. The code for handling \Q and \E is messy. */        entirely. The code for handling \Q and \E is messy. */
3370    
3371        CHECK_RANGE:        CHECK_RANGE:
3372        while (ptr[1] == '\\' && ptr[2] == 'E')        while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3373          {          {
3374          inescq = FALSE;          inescq = FALSE;
3375          ptr += 2;          ptr += 2;
# Line 3071  for (;; ptr++) Line 3379  for (;; ptr++)
3379    
3380        /* Remember \r or \n */        /* Remember \r or \n */
3381    
3382        if (c == '\r' || c == '\n') cd->external_flags |= PCRE_HASCRORLF;        if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
3383    
3384        /* Check for range */        /* Check for range */
3385    
3386        if (!inescq && ptr[1] == '-')        if (!inescq && ptr[1] == CHAR_MINUS)
3387          {          {
3388          int d;          int d;
3389          ptr += 2;          ptr += 2;
3390          while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;          while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
3391    
3392          /* If we hit \Q (not followed by \E) at this point, go into escaped          /* If we hit \Q (not followed by \E) at this point, go into escaped
3393          mode. */          mode. */
3394    
3395          while (*ptr == '\\' && ptr[1] == 'Q')          while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
3396            {            {
3397            ptr += 2;            ptr += 2;
3398            if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }            if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
3399                { ptr += 2; continue; }
3400            inescq = TRUE;            inescq = TRUE;
3401            break;            break;
3402            }            }
3403    
3404          if (*ptr == 0 || (!inescq && *ptr == ']'))          if (*ptr == 0 || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
3405            {            {
3406            ptr = oldptr;            ptr = oldptr;
3407            goto LONE_SINGLE_CHARACTER;            goto LONE_SINGLE_CHARACTER;
# Line 3111  for (;; ptr++) Line 3420  for (;; ptr++)
3420          not any of the other escapes. Perl 5.6 treats a hyphen as a literal          not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3421          in such circumstances. */          in such circumstances. */
3422    
3423          if (!inescq && d == '\\')          if (!inescq && d == CHAR_BACKSLASH)
3424            {            {
3425            d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);            d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3426            if (*errorcodeptr != 0) goto FAILED;            if (*errorcodeptr != 0) goto FAILED;
# Line 3121  for (;; ptr++) Line 3430  for (;; ptr++)
3430    
3431            if (d < 0)            if (d < 0)
3432              {              {
3433              if (d == -ESC_b) d = '\b';              if (d == -ESC_b) d = CHAR_BS;
3434              else if (d == -ESC_X) d = 'X';              else if (d == -ESC_X) d = CHAR_X;
3435              else if (d == -ESC_R) d = 'R'; else              else if (d == -ESC_R) d = CHAR_R; else
3436                {                {
3437                ptr = oldptr;                ptr = oldptr;
3438                goto LONE_SINGLE_CHARACTER;  /* A few lines below */                goto LONE_SINGLE_CHARACTER;  /* A few lines below */
# Line 3144  for (;; ptr++) Line 3453  for (;; ptr++)
3453    
3454          /* Remember \r or \n */          /* Remember \r or \n */
3455    
3456          if (d == '\r' || d == '\n') cd->external_flags |= PCRE_HASCRORLF;          if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
3457    
3458          /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless          /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3459          matching, we have to use an XCLASS with extra data items. Caseless          matching, we have to use an XCLASS with extra data items. Caseless
# Line 3264  for (;; ptr++) Line 3573  for (;; ptr++)
3573          if ((options & PCRE_CASELESS) != 0)          if ((options & PCRE_CASELESS) != 0)
3574            {            {
3575            unsigned int othercase;            unsigned int othercase;
3576            if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)            if ((othercase = UCD_OTHERCASE(c)) != c)
3577              {              {
3578              *class_utf8data++ = XCL_SINGLE;              *class_utf8data++ = XCL_SINGLE;
3579              class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);              class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
# Line 3291  for (;; ptr++) Line 3600  for (;; ptr++)
3600    
3601      /* Loop until ']' reached. This "while" is the end of the "do" above. */      /* Loop until ']' reached. This "while" is the end of the "do" above. */
3602    
3603      while ((c = *(++ptr)) != 0 && (c != ']' || inescq));      while ((c = *(++ptr)) != 0 && (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
3604    
3605      if (c == 0)                          /* Missing terminating ']' */      if (c == 0)                          /* Missing terminating ']' */
3606        {        {
# Line 3436  we set the flag only if there is a liter Line 3745  we set the flag only if there is a liter
3745      /* Various kinds of repeat; '{' is not necessarily a quantifier, but this      /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3746      has been tested above. */      has been tested above. */
3747    
3748      case '{':      case CHAR_LEFT_CURLY_BRACKET:
3749      if (!is_quantifier) goto NORMAL_CHAR;      if (!is_quantifier) goto NORMAL_CHAR;
3750      ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);      ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3751      if (*errorcodeptr != 0) goto FAILED;      if (*errorcodeptr != 0) goto FAILED;
3752      goto REPEAT;      goto REPEAT;
3753    
3754      case '*':      case CHAR_ASTERISK:
3755      repeat_min = 0;      repeat_min = 0;
3756      repeat_max = -1;      repeat_max = -1;
3757      goto REPEAT;      goto REPEAT;
3758    
3759      case '+':      case CHAR_PLUS:
3760      repeat_min = 1;      repeat_min = 1;
3761      repeat_max = -1;      repeat_max = -1;
3762      goto REPEAT;      goto REPEAT;
3763    
3764      case '?':      case CHAR_QUESTION_MARK:
3765      repeat_min = 0;      repeat_min = 0;
3766      repeat_max = 1;      repeat_max = 1;
3767    
# Line 3487  we set the flag only if there is a liter Line 3796  we set the flag only if there is a liter
3796      but if PCRE_UNGREEDY is set, it works the other way round. We change the      but if PCRE_UNGREEDY is set, it works the other way round. We change the
3797      repeat type to the non-default. */      repeat type to the non-default. */
3798    
3799      if (ptr[1] == '+')      if (ptr[1] == CHAR_PLUS)
3800        {        {
3801        repeat_type = 0;                  /* Force greedy */        repeat_type = 0;                  /* Force greedy */
3802        possessive_quantifier = TRUE;        possessive_quantifier = TRUE;
3803        ptr++;        ptr++;
3804        }        }
3805      else if (ptr[1] == '?')      else if (ptr[1] == CHAR_QUESTION_MARK)
3806        {        {
3807        repeat_type = greedy_non_default;        repeat_type = greedy_non_default;
3808        ptr++;        ptr++;
# Line 3608  we set the flag only if there is a liter Line 3917  we set the flag only if there is a liter
3917    
3918        if (repeat_max == 0) goto END_REPEAT;        if (repeat_max == 0) goto END_REPEAT;
3919    
3920          /*--------------------------------------------------------------------*/
3921          /* This code is obsolete from release 8.00; the restriction was finally
3922          removed: */
3923    
3924        /* All real repeats make it impossible to handle partial matching (maybe        /* All real repeats make it impossible to handle partial matching (maybe
3925        one day we will be able to remove this restriction). */        one day we will be able to remove this restriction). */
3926    
3927        if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;        /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */
3928          /*--------------------------------------------------------------------*/
3929    
3930        /* Combine the op_type with the repeat_type */        /* Combine the op_type with the repeat_type */
3931    
# Line 3758  we set the flag only if there is a liter Line 4072  we set the flag only if there is a liter
4072          goto END_REPEAT;          goto END_REPEAT;
4073          }          }
4074    
4075          /*--------------------------------------------------------------------*/
4076          /* This code is obsolete from release 8.00; the restriction was finally
4077          removed: */
4078    
4079        /* All real repeats make it impossible to handle partial matching (maybe        /* All real repeats make it impossible to handle partial matching (maybe
4080        one day we will be able to remove this restriction). */        one day we will be able to remove this restriction). */
4081    
4082        if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;        /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */
4083          /*--------------------------------------------------------------------*/
4084    
4085        if (repeat_min == 0 && repeat_max == -1)        if (repeat_min == 0 && repeat_max == -1)
4086          *code++ = OP_CRSTAR + repeat_type;          *code++ = OP_CRSTAR + repeat_type;
# Line 3819  we set the flag only if there is a liter Line 4138  we set the flag only if there is a liter
4138    
4139        if (repeat_min == 0)        if (repeat_min == 0)
4140          {          {
4141          /* If the maximum is also zero, we just omit the group from the output          /* If the maximum is also zero, we used to just omit the group from the
4142          altogether. */          output altogether, like this:
   
         if (repeat_max == 0)  
           {  
           code = previous;  
           goto END_REPEAT;  
           }  
4143    
4144          /* If the maximum is 1 or unlimited, we just have to stick in the          ** if (repeat_max == 0)
4145          BRAZERO and do no more at this point. However, we do need to adjust          **   {
4146          any OP_RECURSE calls inside the group that refer to the group itself or          **   code = previous;
4147          any internal or forward referenced group, because the offset is from          **   goto END_REPEAT;
4148          the start of the whole regex. Temporarily terminate the pattern while          **   }
4149          doing this. */  
4150            However, that fails when a group is referenced as a subroutine from
4151            elsewhere in the pattern, so now we stick in OP_SKIPZERO in front of it
4152            so that it is skipped on execution. As we don't have a list of which
4153            groups are referenced, we cannot do this selectively.
4154    
4155            If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
4156            and do no more at this point. However, we do need to adjust any
4157            OP_RECURSE calls inside the group that refer to the group itself or any
4158            internal or forward referenced group, because the offset is from the
4159            start of the whole regex. Temporarily terminate the pattern while doing
4160            this. */
4161    
4162          if (repeat_max <= 1)          if (repeat_max <= 1)    /* Covers 0, 1, and unlimited */
4163            {            {
4164            *code = OP_END;            *code = OP_END;
4165            adjust_recurse(previous, 1, utf8, cd, save_hwm);            adjust_recurse(previous, 1, utf8, cd, save_hwm);
4166            memmove(previous+1, previous, len);            memmove(previous+1, previous, len);
4167            code++;            code++;
4168              if (repeat_max == 0)
4169                {
4170                *previous++ = OP_SKIPZERO;
4171                goto END_REPEAT;
4172                }
4173            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
4174            }            }
4175    
# Line 3886  we set the flag only if there is a liter Line 4215  we set the flag only if there is a liter
4215            {            {
4216            /* In the pre-compile phase, we don't actually do the replication. We            /* In the pre-compile phase, we don't actually do the replication. We
4217            just adjust the length as if we had. Do some paranoid checks for            just adjust the length as if we had. Do some paranoid checks for
4218            potential integer overflow. */            potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
4219              integer type when available, otherwise double. */
4220    
4221            if (lengthptr != NULL)            if (lengthptr != NULL)
4222              {              {
4223              int delta = (repeat_min - 1)*length_prevgroup;              int delta = (repeat_min - 1)*length_prevgroup;
4224              if ((double)(repeat_min - 1)*(double)length_prevgroup >              if ((INT64_OR_DOUBLE)(repeat_min - 1)*
4225                                                              (double)INT_MAX ||                    (INT64_OR_DOUBLE)length_prevgroup >
4226                        (INT64_OR_DOUBLE)INT_MAX ||
4227                  OFLOW_MAX - *lengthptr < delta)                  OFLOW_MAX - *lengthptr < delta)
4228                {                {
4229                *errorcodeptr = ERR20;                *errorcodeptr = ERR20;
# Line 3938  we set the flag only if there is a liter Line 4269  we set the flag only if there is a liter
4269          just adjust the length as if we had. For each repetition we must add 1          just adjust the length as if we had. For each repetition we must add 1
4270          to the length for BRAZERO and for all but the last repetition we must          to the length for BRAZERO and for all but the last repetition we must
4271          add 2 + 2*LINKSIZE to allow for the nesting that occurs. Do some          add 2 + 2*LINKSIZE to allow for the nesting that occurs. Do some
4272          paranoid checks to avoid integer overflow. */          paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type is
4273            a 64-bit integer type when available, otherwise double. */
4274    
4275          if (lengthptr != NULL && repeat_max > 0)          if (lengthptr != NULL && repeat_max > 0)
4276            {            {
4277            int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -            int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
4278                        2 - 2*LINK_SIZE;   /* Last one doesn't nest */                        2 - 2*LINK_SIZE;   /* Last one doesn't nest */
4279            if ((double)repeat_max *            if ((INT64_OR_DOUBLE)repeat_max *
4280                  (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)                  (INT64_OR_DOUBLE)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
4281                    > (double)INT_MAX ||                    > (INT64_OR_DOUBLE)INT_MAX ||
4282                OFLOW_MAX - *lengthptr < delta)                OFLOW_MAX - *lengthptr < delta)
4283              {              {
4284              *errorcodeptr = ERR20;              *errorcodeptr = ERR20;
# Line 4035  we set the flag only if there is a liter Line 4367  we set the flag only if there is a liter
4367          }          }
4368        }        }
4369    
4370        /* If previous is OP_FAIL, it was generated by an empty class [] in
4371        JavaScript mode. The other ways in which OP_FAIL can be generated, that is
4372        by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat"
4373        error above. We can just ignore the repeat in the JS case. */
4374    
4375        else if (*previous == OP_FAIL) goto END_REPEAT;
4376    
4377      /* Else there's some kind of shambles */      /* Else there's some kind of shambles */
4378    
4379      else      else
# Line 4059  we set the flag only if there is a liter Line 4398  we set the flag only if there is a liter
4398      if (possessive_quantifier)      if (possessive_quantifier)
4399        {        {
4400        int len;        int len;
4401        if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||  
4402            *tempcode == OP_NOTEXACT)        if (*tempcode == OP_TYPEEXACT)
4403          tempcode += _pcre_OP_lengths[*tempcode] +          tempcode += _pcre_OP_lengths[*tempcode] +
4404            ((*tempcode == OP_TYPEEXACT &&            ((tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP)? 2 : 0);
4405               (tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP))? 2:0);  
4406          else if (*tempcode == OP_EXACT || *tempcode == OP_NOTEXACT)
4407            {
4408            tempcode += _pcre_OP_lengths[*tempcode];
4409    #ifdef SUPPORT_UTF8
4410            if (utf8 && tempcode[-1] >= 0xc0)
4411              tempcode += _pcre_utf8_table4[tempcode[-1] & 0x3f];
4412    #endif
4413            }
4414    
4415        len = code - tempcode;        len = code - tempcode;
4416        if (len > 0) switch (*tempcode)        if (len > 0) switch (*tempcode)
4417          {          {
# Line 4082  we set the flag only if there is a liter Line 4430  we set the flag only if there is a liter
4430          case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;          case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
4431          case OP_NOTUPTO:  *tempcode = OP_NOTPOSUPTO; break;          case OP_NOTUPTO:  *tempcode = OP_NOTPOSUPTO; break;
4432    
4433            /* Because we are moving code along, we must ensure that any
4434            pending recursive references are updated. */
4435    
4436          default:          default:
4437            *code = OP_END;
4438            adjust_recurse(tempcode, 1 + LINK_SIZE, utf8, cd, save_hwm);
4439          memmove(tempcode + 1+LINK_SIZE, tempcode, len);          memmove(tempcode + 1+LINK_SIZE, tempcode, len);
4440          code += 1 + LINK_SIZE;          code += 1 + LINK_SIZE;
4441          len += 1 + LINK_SIZE;          len += 1 + LINK_SIZE;
# Line 4109  we set the flag only if there is a liter Line 4462  we set the flag only if there is a liter
4462      lookbehind or option setting or condition or all the other extended      lookbehind or option setting or condition or all the other extended
4463      parenthesis forms.  */      parenthesis forms.  */
4464    
4465      case '(':      case CHAR_LEFT_PARENTHESIS:
4466      newoptions = options;      newoptions = options;
4467      skipbytes = 0;      skipbytes = 0;
4468      bravalue = OP_CBRA;      bravalue = OP_CBRA;
# Line 4118  we set the flag only if there is a liter Line 4471  we set the flag only if there is a liter
4471    
4472      /* First deal with various "verbs" that can be introduced by '*'. */      /* First deal with various "verbs" that can be introduced by '*'. */
4473    
4474      if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0)      if (*(++ptr) == CHAR_ASTERISK && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
4475        {        {
4476        int i, namelen;        int i, namelen;
4477        const char *vn = verbnames;        const char *vn = verbnames;
4478        const uschar *name = ++ptr;        const uschar *name = ++ptr;
4479        previous = NULL;        previous = NULL;
4480        while ((cd->ctypes[*++ptr] & ctype_letter) != 0);        while ((cd->ctypes[*++ptr] & ctype_letter) != 0) {};
4481        if (*ptr == ':')        if (*ptr == CHAR_COLON)
4482          {          {
4483          *errorcodeptr = ERR59;   /* Not supported */          *errorcodeptr = ERR59;   /* Not supported */
4484          goto FAILED;          goto FAILED;
4485          }          }
4486        if (*ptr != ')')        if (*ptr != CHAR_RIGHT_PARENTHESIS)
4487          {          {
4488          *errorcodeptr = ERR60;          *errorcodeptr = ERR60;
4489          goto FAILED;          goto FAILED;
# Line 4141  we set the flag only if there is a liter Line 4494  we set the flag only if there is a liter
4494          if (namelen == verbs[i].len &&          if (namelen == verbs[i].len &&
4495              strncmp((char *)name, vn, namelen) == 0)              strncmp((char *)name, vn, namelen) == 0)
4496            {            {
4497            *code = verbs[i].op;            /* Check for open captures before ACCEPT */
4498            if (*code++ == OP_ACCEPT) cd->had_accept = TRUE;  
4499              if (verbs[i].op == OP_ACCEPT)
4500                {
4501                open_capitem *oc;
4502                cd->had_accept = TRUE;
4503                for (oc = cd->open_caps; oc != NULL; oc = oc->next)
4504                  {
4505                  *code++ = OP_CLOSE;
4506                  PUT2INC(code, 0, oc->number);
4507                  }
4508                }
4509              *code++ = verbs[i].op;
4510            break;            break;
4511            }            }
4512          vn += verbs[i].len + 1;          vn += verbs[i].len + 1;
# Line 4155  we set the flag only if there is a liter Line 4519  we set the flag only if there is a liter
4519      /* Deal with the extended parentheses; all are introduced by '?', and the      /* Deal with the extended parentheses; all are introduced by '?', and the
4520      appearance of any of them means that this is not a capturing group. */      appearance of any of them means that this is not a capturing group. */
4521    
4522      else if (*ptr == '?')      else if (*ptr == CHAR_QUESTION_MARK)
4523        {        {
4524        int i, set, unset, namelen;        int i, set, unset, namelen;
4525        int *optset;        int *optset;
# Line 4164  we set the flag only if there is a liter Line 4528  we set the flag only if there is a liter
4528    
4529        switch (*(++ptr))        switch (*(++ptr))
4530          {          {
4531          case '#':                 /* Comment; skip to ket */          case CHAR_NUMBER_SIGN:                 /* Comment; skip to ket */
4532          ptr++;          ptr++;
4533          while (*ptr != 0 && *ptr != ')') ptr++;          while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
4534          if (*ptr == 0)          if (*ptr == 0)
4535            {            {
4536            *errorcodeptr = ERR18;            *errorcodeptr = ERR18;
# Line 4176  we set the flag only if there is a liter Line 4540  we set the flag only if there is a liter
4540    
4541    
4542          /* ------------------------------------------------------------ */          /* ------------------------------------------------------------ */
4543          case '|':                 /* Reset capture count for each branch */          case CHAR_VERTICAL_LINE:  /* Reset capture count for each branch */
4544          reset_bracount = TRUE;          reset_bracount = TRUE;
4545          /* Fall through */          /* Fall through */
4546    
4547          /* ------------------------------------------------------------ */          /* ------------------------------------------------------------ */
4548          case ':':                 /* Non-capturing bracket */          case CHAR_COLON:          /* Non-capturing bracket */
4549          bravalue = OP_BRA;          bravalue = OP_BRA;
4550          ptr++;          ptr++;
4551          break;          break;
4552    
4553    
4554          /* ------------------------------------------------------------ */          /* ------------------------------------------------------------ */
4555          case '(':          case CHAR_LEFT_PARENTHESIS:
4556          bravalue = OP_COND;       /* Conditional group */          bravalue = OP_COND;       /* Conditional group */
4557    
4558          /* A condition can be an assertion, a number (referring to a numbered          /* A condition can be an assertion, a number (referring to a numbered
# Line 4208  we set the flag only if there is a liter Line 4572  we set the flag only if there is a liter
4572          the switch. This will take control down to where bracketed groups,          the switch. This will take control down to where bracketed groups,
4573          including assertions, are processed. */          including assertions, are processed. */
4574    
4575          if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))          if (ptr[1] == CHAR_QUESTION_MARK && (ptr[2] == CHAR_EQUALS_SIGN ||
4576                ptr[2] == CHAR_EXCLAMATION_MARK || ptr[2] == CHAR_LESS_THAN_SIGN))
4577            break;            break;
4578    
4579          /* Most other conditions use OP_CREF (a couple change to OP_RREF          /* Most other conditions use OP_CREF (a couple change to OP_RREF
# Line 4220  we set the flag only if there is a liter Line 4585  we set the flag only if there is a liter
4585    
4586          /* Check for a test for recursion in a named group. */          /* Check for a test for recursion in a named group. */
4587    
4588          if (ptr[1] == 'R' && ptr[2] == '&')          if (ptr[1] == CHAR_R && ptr[2] == CHAR_AMPERSAND)
4589            {            {
4590            terminator = -1;            terminator = -1;
4591            ptr += 2;            ptr += 2;
# Line 4230  we set the flag only if there is a liter Line 4595  we set the flag only if there is a liter
4595          /* Check for a test for a named group's having been set, using the Perl          /* Check for a test for a named group's having been set, using the Perl
4596          syntax (?(<name>) or (?('name') */          syntax (?(<name>) or (?('name') */
4597    
4598          else if (ptr[1] == '<')          else if (ptr[1] == CHAR_LESS_THAN_SIGN)
4599            {            {
4600            terminator = '>';            terminator = CHAR_GREATER_THAN_SIGN;
4601            ptr++;            ptr++;
4602            }            }
4603          else if (ptr[1] == '\'')          else if (ptr[1] == CHAR_APOSTROPHE)
4604            {            {
4605            terminator = '\'';            terminator = CHAR_APOSTROPHE;
4606            ptr++;            ptr++;
4607            }            }
4608          else          else
4609            {            {
4610            terminator = 0;            terminator = 0;
4611            if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);            if (ptr[1] == CHAR_MINUS || ptr[1] == CHAR_PLUS) refsign = *(++ptr);
4612            }            }
4613    
4614          /* We now expect to read a name; any thing else is an error */          /* We now expect to read a name; any thing else is an error */
# Line 4263  we set the flag only if there is a liter Line 4628  we set the flag only if there is a liter
4628            {            {
4629            if (recno >= 0)            if (recno >= 0)
4630              recno = ((digitab[*ptr] & ctype_digit) != 0)?              recno = ((digitab[*ptr] & ctype_digit) != 0)?
4631                recno * 10 + *ptr - '0' : -1;                recno * 10 + *ptr - CHAR_0 : -1;
4632            ptr++;            ptr++;
4633            }            }
4634          namelen = ptr - name;          namelen = ptr - name;
4635    
4636          if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')          if ((terminator > 0 && *ptr++ != terminator) ||
4637                *ptr++ != CHAR_RIGHT_PARENTHESIS)
4638            {            {
4639            ptr--;      /* Error offset */            ptr--;      /* Error offset */
4640            *errorcodeptr = ERR26;            *errorcodeptr = ERR26;
# Line 4290  we set the flag only if there is a liter Line 4656  we set the flag only if there is a liter
4656              *errorcodeptr = ERR58;              *errorcodeptr = ERR58;
4657              goto FAILED;              goto FAILED;
4658              }              }
4659            recno = (refsign == '-')?            recno = (refsign == CHAR_MINUS)?
4660              cd->bracount - recno + 1 : recno +cd->bracount;              cd->bracount - recno + 1 : recno +cd->bracount;
4661            if (recno <= 0 || recno > cd->final_bracount)            if (recno <= 0 || recno > cd->final_bracount)
4662              {              {
# Line 4302  we set the flag only if there is a liter Line 4668  we set the flag only if there is a liter
4668            }            }
4669    
4670          /* Otherwise (did not start with "+" or "-"), start by looking for the          /* Otherwise (did not start with "+" or "-"), start by looking for the
4671          name. */          name. If we find a name, add one to the opcode to change OP_CREF or
4672            OP_RREF into OP_NCREF or OP_NRREF. These behave exactly the same,
4673            except they record that the reference was originally to a name. The
4674            information is used to check duplicate names. */
4675    
4676          slot = cd->name_table;          slot = cd->name_table;
4677          for (i = 0; i < cd->names_found; i++)          for (i = 0; i < cd->names_found; i++)
# Line 4317  we set the flag only if there is a liter Line 4686  we set the flag only if there is a liter
4686            {            {
4687            recno = GET2(slot, 0);            recno = GET2(slot, 0);
4688            PUT2(code, 2+LINK_SIZE, recno);            PUT2(code, 2+LINK_SIZE, recno);
4689              code[1+LINK_SIZE]++;
4690            }            }
4691    
4692          /* Search the pattern for a forward reference */          /* Search the pattern for a forward reference */
4693    
4694          else if ((i = find_parens(ptr, cd->bracount, name, namelen,          else if ((i = find_parens(cd, name, namelen,
4695                          (options & PCRE_EXTENDED) != 0)) > 0)                          (options & PCRE_EXTENDED) != 0)) > 0)
4696            {            {
4697            PUT2(code, 2+LINK_SIZE, i);            PUT2(code, 2+LINK_SIZE, i);
4698              code[1+LINK_SIZE]++;
4699            }            }
4700    
4701          /* If terminator == 0 it means that the name followed directly after          /* If terminator == 0 it means that the name followed directly after
# Line 4342  we set the flag only if there is a liter Line 4713  we set the flag only if there is a liter
4713          /* Check for (?(R) for recursion. Allow digits after R to specify a          /* Check for (?(R) for recursion. Allow digits after R to specify a
4714          specific group number. */          specific group number. */
4715    
4716          else if (*name == 'R')          else if (*name == CHAR_R)
4717            {            {
4718            recno = 0;            recno = 0;
4719            for (i = 1; i < namelen; i++)            for (i = 1; i < namelen; i++)
# Line 4352  we set the flag only if there is a liter Line 4723  we set the flag only if there is a liter
4723                *errorcodeptr = ERR15;                *errorcodeptr = ERR15;
4724                goto FAILED;                goto FAILED;
4725                }                }
4726              recno = recno * 10 + name[i] - '0';              recno = recno * 10 + name[i] - CHAR_0;
4727              }              }
4728            if (recno == 0) recno = RREF_ANY;            if (recno == 0) recno = RREF_ANY;
4729            code[1+LINK_SIZE] = OP_RREF;      /* Change test type */            code[1+LINK_SIZE] = OP_RREF;      /* Change test type */
# Line 4362  we set the flag only if there is a liter Line 4733  we set the flag only if there is a liter
4733          /* Similarly, check for the (?(DEFINE) "condition", which is always          /* Similarly, check for the (?(DEFINE) "condition", which is always
4734          false. */          false. */
4735    
4736          else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)          else if (namelen == 6 && strncmp((char *)name, STRING_DEFINE, 6) == 0)
4737            {            {
4738            code[1+LINK_SIZE] = OP_DEF;            code[1+LINK_SIZE] = OP_DEF;
4739            skipbytes = 1;            skipbytes = 1;
# Line 4387  we set the flag only if there is a liter Line 4758  we set the flag only if there is a liter
4758    
4759    
4760          /* ------------------------------------------------------------ */          /* ------------------------------------------------------------ */
4761          case '=':                 /* Positive lookahead */          case CHAR_EQUALS_SIGN:                 /* Positive lookahead */
4762          bravalue = OP_ASSERT;          bravalue = OP_ASSERT;
4763          ptr++;          ptr++;
4764          break;          break;
4765    
4766    
4767          /* ------------------------------------------------------------ */          /* ------------------------------------------------------------ */
4768          case '!':                 /* Negative lookahead */          case CHAR_EXCLAMATION_MARK:            /* Negative lookahead */
4769          ptr++;          ptr++;
4770          if (*ptr == ')')          /* Optimize (?!) */          if (*ptr == CHAR_RIGHT_PARENTHESIS)    /* Optimize (?!) */
4771            {            {
4772            *code++ = OP_FAIL;            *code++ = OP_FAIL;
4773            previous = NULL;            previous = NULL;
# Line 4407  we set the flag only if there is a liter Line 4778  we set the flag only if there is a liter
4778    
4779    
4780          /* ------------------------------------------------------------ */          /* ------------------------------------------------------------ */
4781          case '<':                 /* Lookbehind or named define */          case CHAR_LESS_THAN_SIGN:              /* Lookbehind or named define */
4782          switch (ptr[1])          switch (ptr[1])
4783            {            {
4784            case '=':               /* Positive lookbehind */            case CHAR_EQUALS_SIGN:               /* Positive lookbehind */
4785            bravalue = OP_ASSERTBACK;            bravalue = OP_ASSERTBACK;
4786            ptr += 2;            ptr += 2;
4787            break;            break;
4788    
4789            case '!':               /* Negative lookbehind */            case CHAR_EXCLAMATION_MARK:          /* Negative lookbehind */
4790            bravalue = OP_ASSERTBACK_NOT;            bravalue = OP_ASSERTBACK_NOT;
4791            ptr += 2;            ptr += 2;
4792            break;            break;
# Line 4430  we set the flag only if there is a liter Line 4801  we set the flag only if there is a liter
4801    
4802    
4803          /* ------------------------------------------------------------ */          /* ------------------------------------------------------------ */
4804          case '>':                 /* One-time brackets */          case CHAR_GREATER_THAN_SIGN:           /* One-time brackets */
4805          bravalue = OP_ONCE;          bravalue = OP_ONCE;
4806          ptr++;          ptr++;
4807          break;          break;
4808    
4809    
4810          /* ------------------------------------------------------------ */          /* ------------------------------------------------------------ */
4811          case 'C':                 /* Callout - may be followed by digits; */          case CHAR_C:                 /* Callout - may be followed by digits; */
4812          previous_callout = code;  /* Save for later completion */          previous_callout = code;  /* Save for later completion */
4813          after_manual_callout = 1; /* Skip one item before completing */          after_manual_callout = 1; /* Skip one item before completing */
4814          *code++ = OP_CALLOUT;          *code++ = OP_CALLOUT;
4815            {            {
4816            int n = 0;            int n = 0;
4817            while ((digitab[*(++ptr)] & ctype_digit) != 0)            while ((digitab[*(++ptr)] & ctype_digit) != 0)
4818              n = n * 10 + *ptr - '0';              n = n * 10 + *ptr - CHAR_0;
4819            if (*ptr != ')')            if (*ptr != CHAR_RIGHT_PARENTHESIS)
4820              {              {
4821              *errorcodeptr = ERR39;              *errorcodeptr = ERR39;
4822              goto FAILED;              goto FAILED;
# Line 4465  we set the flag only if there is a liter Line 4836  we set the flag only if there is a liter
4836    
4837    
4838          /* ------------------------------------------------------------ */          /* ------------------------------------------------------------ */
4839          case 'P':                 /* Python-style named subpattern handling */          case CHAR_P:              /* Python-style named subpattern handling */
4840          if (*(++ptr) == '=' || *ptr == '>')  /* Reference or recursion */          if (*(++ptr) == CHAR_EQUALS_SIGN ||
4841                *ptr == CHAR_GREATER_THAN_SIGN)  /* Reference or recursion */
4842            {            {
4843            is_recurse = *ptr == '>';            is_recurse = *ptr == CHAR_GREATER_THAN_SIGN;
4844            terminator = ')';            terminator = CHAR_RIGHT_PARENTHESIS;
4845            goto NAMED_REF_OR_RECURSE;            goto NAMED_REF_OR_RECURSE;
4846            }            }
4847          else if (*ptr != '<')    /* Test for Python-style definition */          else if (*ptr != CHAR_LESS_THAN_SIGN)  /* Test for Python-style defn */
4848            {            {
4849            *errorcodeptr = ERR41;            *errorcodeptr = ERR41;
4850            goto FAILED;            goto FAILED;
# Line 4482  we set the flag only if there is a liter Line 4854  we set the flag only if there is a liter
4854    
4855          /* ------------------------------------------------------------ */          /* ------------------------------------------------------------ */
4856          DEFINE_NAME:    /* Come here from (?< handling */          DEFINE_NAME:    /* Come here from (?< handling */
4857          case '\'':          case CHAR_APOSTROPHE:
4858            {            {
4859            terminator = (*ptr == '<')? '>' : '\'';            terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
4860                CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
4861            name = ++ptr;            name = ++ptr;
4862    
4863            while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;            while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
# Line 4515  we set the flag only if there is a liter Line 4888  we set the flag only if there is a liter
4888                }                }
4889              }              }
4890    
4891            /* In the real compile, create the entry in the table */            /* In the real compile, create the entry in the table, maintaining
4892              alphabetical order. Duplicate names for different numbers are
4893              permitted only if PCRE_DUPNAMES is set. Duplicate names for the same
4894              number are always OK. (An existing number can be re-used if (?|
4895              appears in the pattern.) In either event, a duplicate name results in
4896              a duplicate entry in the table, even if the number is the same. This
4897              is because the number of names, and hence the table size, is computed
4898              in the pre-compile, and it affects various numbers and pointers which
4899              would all have to be modified, and the compiled code moved down, if
4900              duplicates with the same number were omitted from the table. This
4901              doesn't seem worth the hassle. However, *different* names for the
4902              same number are not permitted. */
4903    
4904            else            else
4905              {              {
4906                BOOL dupname = FALSE;
4907              slot = cd->name_table;              slot = cd->name_table;
4908    
4909              for (i = 0; i < cd->names_found; i++)              for (i = 0; i < cd->names_found; i++)
4910                {                {
4911                int crc = memcmp(name, slot+2, namelen);                int crc = memcmp(name, slot+2, namelen);
# Line 4527  we set the flag only if there is a liter Line 4913  we set the flag only if there is a liter
4913                  {                  {
4914                  if (slot[2+namelen] == 0)                  if (slot[2+namelen] == 0)
4915                    {                    {
4916                    if ((options & PCRE_DUPNAMES) == 0)                    if (GET2(slot, 0) != cd->bracount + 1 &&
4917                          (options & PCRE_DUPNAMES) == 0)
4918                      {                      {
4919                      *errorcodeptr = ERR43;                      *errorcodeptr = ERR43;
4920                      goto FAILED;                      goto FAILED;
4921                      }                      }
4922                      else dupname = TRUE;
4923                    }                    }
4924                  else crc = -1;      /* Current name is substring */                  else crc = -1;      /* Current name is a substring */
4925                  }                  }
4926    
4927                  /* Make space in the table and break the loop for an earlier
4928                  name. For a duplicate or later name, carry on. We do this for
4929                  duplicates so that in the simple case (when (?| is not used) they
4930                  are in order of their numbers. */
4931    
4932                if (crc < 0)                if (crc < 0)
4933                  {                  {
4934                  memmove(slot + cd->name_entry_size, slot,                  memmove(slot + cd->name_entry_size, slot,
4935                    (cd->names_found - i) * cd->name_entry_size);                    (cd->names_found - i) * cd->name_entry_size);
4936                  break;                  break;
4937                  }                  }
4938    
4939                  /* Continue the loop for a later or duplicate name */
4940    
4941                slot += cd->name_entry_size;                slot += cd->name_entry_size;
4942                }                }
4943    
4944                /* For non-duplicate names, check for a duplicate number before
4945                adding the new name. */
4946    
4947                if (!dupname)
4948                  {
4949                  uschar *cslot = cd->name_table;
4950                  for (i = 0; i < cd->names_found; i++)
4951                    {
4952                    if (cslot != slot)
4953                      {
4954                      if (GET2(cslot, 0) == cd->bracount + 1)
4955                        {
4956                        *errorcodeptr = ERR65;
4957                        goto FAILED;
4958                        }
4959                      }
4960                    else i--;
4961                    cslot += cd->name_entry_size;
4962                    }
4963                  }
4964    
4965              PUT2(slot, 0, cd->bracount + 1);              PUT2(slot, 0, cd->bracount + 1);
4966              memcpy(slot + 2, name, namelen);              memcpy(slot + 2, name, namelen);
4967              slot[2+namelen] = 0;              slot[2+namelen] = 0;
4968              }              }
4969            }            }
4970    
4971          /* In both cases, count the number of names we've encountered. */          /* In both pre-compile and compile, count the number of names we've
4972            encountered. */
4973    
         ptr++;                    /* Move past > or ' */  
4974          cd->names_found++;          cd->names_found++;
4975            ptr++;                    /* Move past > or ' */
4976          goto NUMBERED_GROUP;          goto NUMBERED_GROUP;
4977    
4978    
4979          /* ------------------------------------------------------------ */          /* ------------------------------------------------------------ */
4980          case '&':                 /* Perl recursion/subroutine syntax */          case CHAR_AMPERSAND:            /* Perl recursion/subroutine syntax */
4981          terminator = ')';          terminator = CHAR_RIGHT_PARENTHESIS;
4982          is_recurse = TRUE;          is_recurse = TRUE;
4983          /* Fall through */          /* Fall through */
4984    
# Line 4567  we set the flag only if there is a liter Line 4986  we set the flag only if there is a liter
4986          references (?P=name) and recursion (?P>name), as well as falling          references (?P=name) and recursion (?P>name), as well as falling
4987          through from the Perl recursion syntax (?&name). We also come here from          through from the Perl recursion syntax (?&name). We also come here from
4988          the Perl \k<name> or \k'name' back reference syntax and the \k{name}          the Perl \k<name> or \k'name' back reference syntax and the \k{name}
4989          .NET syntax. */          .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */
4990    
4991          NAMED_REF_OR_RECURSE:          NAMED_REF_OR_RECURSE:
4992          name = ++ptr;          name = ++ptr;
# Line 4618  we set the flag only if there is a liter Line 5037  we set the flag only if there is a liter
5037              recno = GET2(slot, 0);              recno = GET2(slot, 0);
5038              }              }
5039            else if ((recno =                /* Forward back reference */            else if ((recno =                /* Forward back reference */
5040                      find_parens(ptr, cd->bracount, name, namelen,                      find_parens(cd, name, namelen,
5041                        (options & PCRE_EXTENDED) != 0)) <= 0)                        (options & PCRE_EXTENDED) != 0)) <= 0)
5042              {              {
5043              *errorcodeptr = ERR15;              *errorcodeptr = ERR15;
# Line 4634  we set the flag only if there is a liter Line 5053  we set the flag only if there is a liter
5053    
5054    
5055          /* ------------------------------------------------------------ */          /* ------------------------------------------------------------ */
5056          case 'R':                 /* Recursion */          case CHAR_R:              /* Recursion */
5057          ptr++;                    /* Same as (?0)      */          ptr++;                    /* Same as (?0)      */
5058          /* Fall through */          /* Fall through */
5059    
5060    
5061          /* ------------------------------------------------------------ */          /* ------------------------------------------------------------ */
5062          case '-': case '+':          case CHAR_MINUS: case CHAR_PLUS:  /* Recursion or subroutine */
5063          case '0': case '1': case '2': case '3': case '4':   /* Recursion or */          case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
5064          case '5': case '6': case '7': case '8': case '9':   /* subroutine */          case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
5065            {            {
5066            const uschar *called;            const uschar *called;
5067              terminator = CHAR_RIGHT_PARENTHESIS;
5068    
5069              /* Come here from the \g<...> and \g'...' code (Oniguruma
5070              compatibility). However, the syntax has been checked to ensure that
5071              the ... are a (signed) number, so that neither ERR63 nor ERR29 will
5072              be called on this path, nor will the jump to OTHER_CHAR_AFTER_QUERY
5073              ever be taken. */
5074    
5075            if ((refsign = *ptr) == '+')            HANDLE_NUMERICAL_RECURSION:
5076    
5077              if ((refsign = *ptr) == CHAR_PLUS)
5078              {              {
5079              ptr++;              ptr++;
5080              if ((digitab[*ptr] & ctype_digit) == 0)              if ((digitab[*ptr] & ctype_digit) == 0)
# Line 4655  we set the flag only if there is a liter Line 5083  we set the flag only if there is a liter
5083                goto FAILED;                goto FAILED;
5084                }                }
5085              }              }
5086            else if (refsign == '-')            else if (refsign == CHAR_MINUS)
5087              {              {
5088              if ((digitab[ptr[1]] & ctype_digit) == 0)              if ((digitab[ptr[1]] & ctype_digit) == 0)
5089                goto OTHER_CHAR_AFTER_QUERY;                goto OTHER_CHAR_AFTER_QUERY;
# Line 4664  we set the flag only if there is a liter Line 5092  we set the flag only if there is a liter
5092    
5093            recno = 0;            recno = 0;
5094            while((digitab[*ptr] & ctype_digit) != 0)            while((digitab[*ptr] & ctype_digit) != 0)
5095              recno = recno * 10 + *ptr++ - '0';              recno = recno * 10 + *ptr++ - CHAR_0;
5096    
5097            if (*ptr != ')')            if (*ptr != terminator)
5098              {              {
5099              *errorcodeptr = ERR29;              *errorcodeptr = ERR29;
5100              goto FAILED;              goto FAILED;
5101              }              }
5102    
5103            if (refsign == '-')            if (refsign == CHAR_MINUS)
5104              {              {
5105              if (recno == 0)              if (recno == 0)
5106                {                {
# Line 4686  we set the flag only if there is a liter Line 5114  we set the flag only if there is a liter
5114                goto FAILED;                goto FAILED;
5115                }                }
5116              }              }
5117            else if (refsign == '+')            else if (refsign == CHAR_PLUS)
5118              {              {
5119              if (recno == 0)              if (recno == 0)
5120                {                {
# Line 4713  we set the flag only if there is a liter Line 5141  we set the flag only if there is a liter
5141            if (lengthptr == NULL)            if (lengthptr == NULL)
5142              {              {
5143              *code = OP_END;              *code = OP_END;
5144              if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);              if (recno != 0)
5145                  called = _pcre_find_bracket(cd->start_code, utf8, recno);
5146    
5147              /* Forward reference */              /* Forward reference */
5148    
5149              if (called == NULL)              if (called == NULL)
5150                {                {
5151                if (find_parens(ptr, cd->bracount, NULL, recno,                if (find_parens(cd, NULL, recno,
5152                     (options & PCRE_EXTENDED) != 0) < 0)                      (options & PCRE_EXTENDED) != 0) < 0)
5153                  {                  {
5154                  *errorcodeptr = ERR15;                  *errorcodeptr = ERR15;
5155                  goto FAILED;                  goto FAILED;
5156                  }                  }
5157    
5158                  /* Fudge the value of "called" so that when it is inserted as an
5159                  offset below, what is actually inserted is the reference number
5160                  of the group. */
5161    
5162                called = cd->start_code + recno;                called = cd->start_code + recno;
5163                PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);                PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
5164                }                }
# Line 4772  we set the flag only if there is a liter Line 5206  we set the flag only if there is a liter
5206          set = unset = 0;          set = unset = 0;
5207          optset = &set;          optset = &set;
5208    
5209          while (*ptr != ')' && *ptr != ':')          while (*ptr != CHAR_RIGHT_PARENTHESIS && *ptr != CHAR_COLON)
5210            {            {
5211            switch (*ptr++)            switch (*ptr++)
5212              {              {
5213              case '-': optset = &unset; break;              case CHAR_MINUS: optset = &unset; break;
5214    
5215              case 'J':    /* Record that it changed in the external options */              case CHAR_J:    /* Record that it changed in the external options */
5216              *optset |= PCRE_DUPNAMES;              *optset |= PCRE_DUPNAMES;
5217              cd->external_flags |= PCRE_JCHANGED;              cd->external_flags |= PCRE_JCHANGED;
5218              break;              break;
5219    
5220              case 'i': *optset |= PCRE_CASELESS; break;              case CHAR_i: *optset |= PCRE_CASELESS; break;
5221              case 'm': *optset |= PCRE_MULTILINE; break;              case CHAR_m: *optset |= PCRE_MULTILINE; break;
5222              case 's': *optset |= PCRE_DOTALL; break;              case CHAR_s: *optset |= PCRE_DOTALL; break;
5223              case 'x': *optset |= PCRE_EXTENDED; break;              case CHAR_x: *optset |= PCRE_EXTENDED; break;
5224              case 'U': *optset |= PCRE_UNGREEDY; break;              case CHAR_U: *optset |= PCRE_UNGREEDY; break;
5225              case 'X': *optset |= PCRE_EXTRA; break;              case CHAR_X: *optset |= PCRE_EXTRA; break;
5226    
5227              default:  *errorcodeptr = ERR12;              default:  *errorcodeptr = ERR12;
5228                        ptr--;    /* Correct the offset */                        ptr--;    /* Correct the offset */
# Line 4819  we set the flag only if there is a liter Line 5253  we set the flag only if there is a liter
5253          both phases.          both phases.
5254    
5255          If we are not at the pattern start, compile code to change the ims          If we are not at the pattern start, compile code to change the ims
5256          options if this setting actually changes any of them. We also pass the          options if this setting actually changes any of them, and reset the
5257          new setting back so that it can be put at the start of any following          greedy defaults and the case value for firstbyte and reqbyte. */
         branches, and when this group ends (if we are in a group), a resetting  
         item can be compiled. */  
5258    
5259          if (*ptr == ')')          if (*ptr == CHAR_RIGHT_PARENTHESIS)
5260            {            {
5261            if (code == cd->start_code + 1 + LINK_SIZE &&            if (code == cd->start_code + 1 + LINK_SIZE &&
5262                 (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))                 (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
5263              {              {
5264              cd->external_options = newoptions;              cd->external_options = newoptions;
             options = newoptions;  
5265              }              }
5266           else            else
5267              {              {
5268              if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))              if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
5269                {                {
5270                *code++ = OP_OPT;                *code++ = OP_OPT;
5271                *code++ = newoptions & PCRE_IMS;                *code++ = newoptions & PCRE_IMS;
5272                }                }
   
             /* Change options at this level, and pass them back for use  
             in subsequent branches. Reset the greedy defaults and the case  
             value for firstbyte and reqbyte. */  
   
             *optionsptr = options = newoptions;  
5273              greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);              greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
5274              greedy_non_default = greedy_default ^ 1;              greedy_non_default = greedy_default ^ 1;
5275              req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;              req_caseopt = ((newoptions & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
5276              }              }
5277    
5278              /* Change options at this level, and pass them back for use
5279              in subsequent branches. When not at the start of the pattern, this
5280              information is also necessary so that a resetting item can be
5281              compiled at the end of a group (if we are in a group). */
5282    
5283              *optionsptr = options = newoptions;
5284            previous = NULL;       /* This item can't be repeated */            previous = NULL;       /* This item can't be repeated */
5285            continue;              /* It is complete */            continue;              /* It is complete */
5286            }            }
# Line 4965  we set the flag only if there is a liter Line 5396  we set the flag only if there is a liter
5396    
5397      /* Error if hit end of pattern */      /* Error if hit end of pattern */
5398    
5399      if (*ptr != ')')      if (*ptr != CHAR_RIGHT_PARENTHESIS)
5400        {        {
5401        *errorcodeptr = ERR14;        *errorcodeptr = ERR14;
5402        goto FAILED;        goto FAILED;
# Line 5063  we set the flag only if there is a liter Line 5494  we set the flag only if there is a liter
5494      We can test for values between ESC_b and ESC_Z for the latter; this may      We can test for values between ESC_b and ESC_Z for the latter; this may
5495      have to change if any new ones are ever created. */      have to change if any new ones are ever created. */
5496    
5497      case '\\':      case CHAR_BACKSLASH:
5498      tempptr = ptr;      tempptr = ptr;
5499      c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);      c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
5500      if (*errorcodeptr != 0) goto FAILED;      if (*errorcodeptr != 0) goto FAILED;
# Line 5072  we set the flag only if there is a liter Line 5503  we set the flag only if there is a liter
5503        {        {
5504        if (-c == ESC_Q)            /* Handle start of quoted string */        if (-c == ESC_Q)            /* Handle start of quoted string */
5505          {          {
5506          if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */          if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
5507            else inescq = TRUE;            ptr += 2;               /* avoid empty string */
5508                else inescq = TRUE;
5509          continue;          continue;
5510          }          }
5511    
# Line 5090  we set the flag only if there is a liter Line 5522  we set the flag only if there is a liter
5522        zerofirstbyte = firstbyte;        zerofirstbyte = firstbyte;
5523        zeroreqbyte = reqbyte;        zeroreqbyte = reqbyte;
5524    
5525          /* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n'
5526          is a subroutine call by number (Oniguruma syntax). In fact, the value
5527          -ESC_g is returned only for these cases. So we don't need to check for <
5528          or ' if the value is -ESC_g. For the Perl syntax \g{n} the value is
5529          -ESC_REF+n, and for the Perl syntax \g{name} the result is -ESC_k (as
5530          that is a synonym for a named back reference). */
5531    
5532          if (-c == ESC_g)
5533            {
5534            const uschar *p;
5535            save_hwm = cd->hwm;   /* Normally this is set when '(' is read */
5536            terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
5537              CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
5538    
5539            /* These two statements stop the compiler from warning about possibly
5540            unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In
5541            fact, because we actually check for a number below, the paths that
5542            would actually be in error are never taken. */
5543    
5544            skipbytes = 0;
5545            reset_bracount = FALSE;
5546    
5547            /* Test for a name */
5548    
5549            if (ptr[1] != CHAR_PLUS && ptr[1] != CHAR_MINUS)
5550              {
5551              BOOL isnumber = TRUE;
5552              for (p = ptr + 1; *p != 0 && *p != terminator; p++)
5553                {
5554                if ((cd->ctypes[*p] & ctype_digit) == 0) isnumber = FALSE;
5555                if ((cd->ctypes[*p] & ctype_word) == 0) break;
5556                }
5557              if (*p != terminator)
5558                {
5559                *errorcodeptr = ERR57;
5560                break;
5561                }
5562              if (isnumber)
5563                {
5564                ptr++;
5565                goto HANDLE_NUMERICAL_RECURSION;
5566                }
5567              is_recurse = TRUE;
5568              goto NAMED_REF_OR_RECURSE;
5569              }
5570    
5571            /* Test a signed number in angle brackets or quotes. */
5572    
5573            p = ptr + 2;
5574            while ((digitab[*p] & ctype_digit) != 0) p++;
5575            if (*p != terminator)
5576              {
5577              *errorcodeptr = ERR57;
5578              break;
5579              }
5580            ptr++;
5581            goto HANDLE_NUMERICAL_RECURSION;
5582            }
5583    
5584        /* \k<name> or \k'name' is a back reference by name (Perl syntax).        /* \k<name> or \k'name' is a back reference by name (Perl syntax).
5585        We also support \k{name} (.NET syntax) */        We also support \k{name} (.NET syntax) */
5586    
5587        if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\'' || ptr[1] == '{'))        if (-c == ESC_k && (ptr[1] == CHAR_LESS_THAN_SIGN ||
5588              ptr[1] == CHAR_APOSTROPHE || ptr[1] == CHAR_LEFT_CURLY_BRACKET))
5589          {          {
5590          is_recurse = FALSE;          is_recurse = FALSE;
5591          terminator = (*(++ptr) == '<')? '>' : (*ptr == '\'')? '\'' : '}';          terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
5592              CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
5593              CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
5594          goto NAMED_REF_OR_RECURSE;          goto NAMED_REF_OR_RECURSE;
5595          }          }
5596    
# Line 5106  we set the flag only if there is a liter Line 5600  we set the flag only if there is a liter
5600    
5601        if (-c >= ESC_REF)        if (-c >= ESC_REF)
5602          {          {
5603            open_capitem *oc;
5604          recno = -c - ESC_REF;          recno = -c - ESC_REF;
5605    
5606          HANDLE_REFERENCE:    /* Come here from named backref handling */          HANDLE_REFERENCE:    /* Come here from named backref handling */
# Line 5115  we set the flag only if there is a liter Line 5610  we set the flag only if there is a liter
5610          PUT2INC(code, 0, recno);          PUT2INC(code, 0, recno);
5611          cd->backref_map |= (recno < 32)? (1 << recno) : 1;          cd->backref_map |= (recno < 32)? (1 << recno) : 1;
5612          if (recno > cd->top_backref) cd->top_backref = recno;          if (recno > cd->top_backref) cd->top_backref = recno;
5613    
5614            /* Check to see if this back reference is recursive, that is, it
5615            is inside the group that it references. A flag is set so that the
5616            group can be made atomic. */
5617    
5618            for (oc = cd->open_caps; oc != NULL; oc = oc->next)
5619              {
5620              if (oc->number == recno)
5621                {
5622                oc->flag = TRUE;
5623                break;
5624                }
5625              }
5626          }          }
5627    
5628        /* So are Unicode property matches, if supported. */        /* So are Unicode property matches, if supported. */
# Line 5199  we set the flag only if there is a liter Line 5707  we set the flag only if there is a liter
5707    
5708      /* Remember if \r or \n were seen */      /* Remember if \r or \n were seen */
5709    
5710      if (mcbuffer[0] == '\r' || mcbuffer[0] == '\n')      if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL)
5711        cd->external_flags |= PCRE_HASCRORLF;        cd->external_flags |= PCRE_HASCRORLF;
5712    
5713      /* Set the first and required bytes appropriately. If no previous first      /* Set the first and required bytes appropriately. If no previous first
# Line 5297  uschar *code = *codeptr; Line 5805  uschar *code = *codeptr;
5805  uschar *last_branch = code;  uschar *last_branch = code;
5806  uschar *start_bracket = code;  uschar *start_bracket = code;
5807  uschar *reverse_count = NULL;  uschar *reverse_count = NULL;
5808    open_capitem capitem;
5809    int capnumber = 0;
5810  int firstbyte, reqbyte;  int firstbyte, reqbyte;
5811  int branchfirstbyte, branchreqbyte;  int branchfirstbyte, branchreqbyte;
5812  int length;  int length;
5813  int orig_bracount;  int orig_bracount;
5814  int max_bracount;  int max_bracount;
5815    int old_external_options = cd->external_options;
5816  branch_chain bc;  branch_chain bc;
5817    
5818  bc.outer = bcptr;  bc.outer = bcptr;
5819  bc.current = code;  bc.current_branch = code;
5820    
5821  firstbyte = reqbyte = REQ_UNSET;  firstbyte = reqbyte = REQ_UNSET;
5822    
# Line 5323  the code that abstracts option settings Line 5834  the code that abstracts option settings
5834  them global. It tests the value of length for (2 + 2*LINK_SIZE) in the  them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
5835  pre-compile phase to find out whether anything has yet been compiled or not. */  pre-compile phase to find out whether anything has yet been compiled or not. */
5836    
5837    /* If this is a capturing subpattern, add to the chain of open capturing items
5838    so that we can detect them if (*ACCEPT) is encountered. This is also used to
5839    detect groups that contain recursive back references to themselves. */
5840    
5841    if (*code == OP_CBRA)
5842      {
5843      capnumber = GET2(code, 1 + LINK_SIZE);
5844      capitem.number = capnumber;
5845      capitem.next = cd->open_caps;
5846      capitem.flag = FALSE;
5847      cd->open_caps = &capitem;
5848      }
5849    
5850  /* Offset is set zero to mark that this bracket is still open */  /* Offset is set zero to mark that this bracket is still open */
5851    
5852  PUT(code, 1, 0);  PUT(code, 1, 0);
# Line 5367  for (;;) Line 5891  for (;;)
5891      return FALSE;      return FALSE;
5892      }      }
5893    
5894      /* If the external options have changed during this branch, it means that we
5895      are at the top level, and a leading option setting has been encountered. We
5896      need to re-set the original option values to take account of this so that,
5897      during the pre-compile phase, we know to allow for a re-set at the start of
5898      subsequent branches. */
5899    
5900      if (old_external_options != cd->external_options)
5901        oldims = cd->external_options & PCRE_IMS;
5902    
5903    /* Keep the highest bracket count in case (?| was used and some branch    /* Keep the highest bracket count in case (?| was used and some branch
5904    has fewer than the rest. */    has fewer than the rest. */
5905    
# Line 5417  for (;;) Line 5950  for (;;)
5950    
5951      /* If lookbehind, check that this branch matches a fixed-length string, and      /* If lookbehind, check that this branch matches a fixed-length string, and
5952      put the length into the OP_REVERSE item. Temporarily mark the end of the      put the length into the OP_REVERSE item. Temporarily mark the end of the
5953      branch with OP_END. */      branch with OP_END. If the branch contains OP_RECURSE, the result is -3
5954        because there may be forward references that we can't check here. Set a
5955        flag to cause another lookbehind check at the end. Why not do it all at the
5956        end? Because common, erroneous checks are picked up here and the offset of
5957        the problem can be shown. */
5958    
5959      if (lookbehind)      if (lookbehind)
5960        {        {
5961        int fixed_length;        int fixed_length;
5962        *code = OP_END;        *code = OP_END;
5963        fixed_length = find_fixedlength(last_branch, options);        fixed_length = find_fixedlength(last_branch, options, FALSE, cd);
5964        DPRINTF(("fixed length = %d\n", fixed_length));        DPRINTF(("fixed length = %d\n", fixed_length));
5965        if (fixed_length < 0)        if (fixed_length == -3)
5966            {
5967            cd->check_lookbehind = TRUE;
5968            }
5969          else if (fixed_length < 0)
5970          {          {
5971          *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;          *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
5972          *ptrptr = ptr;          *ptrptr = ptr;
5973          return FALSE;          return FALSE;
5974          }          }
5975        PUT(reverse_count, 0, fixed_length);        else { PUT(reverse_count, 0, fixed_length); }
5976        }        }
5977      }      }
5978    
# Line 5444  for (;;) Line 5985  for (;;)
5985    compile a resetting op-code following, except at the very end of the pattern.    compile a resetting op-code following, except at the very end of the pattern.
5986    Return leaving the pointer at the terminating char. */    Return leaving the pointer at the terminating char. */
5987    
5988    if (*ptr != '|')    if (*ptr != CHAR_VERTICAL_LINE)
5989      {      {
5990      if (lengthptr == NULL)      if (lengthptr == NULL)
5991        {        {
# Line 5465  for (;;) Line 6006  for (;;)
6006      PUT(code, 1, code - start_bracket);      PUT(code, 1, code - start_bracket);
6007      code += 1 + LINK_SIZE;      code += 1 + LINK_SIZE;
6008    
6009      /* Resetting option if needed */      /* If it was a capturing subpattern, check to see if it contained any
6010        recursive back references. If so, we must wrap it in atomic brackets.
6011        In any event, remove the block from the chain. */
6012    
6013        if (capnumber > 0)
6014          {
6015          if (cd->open_caps->flag)
6016            {
6017            memmove(start_bracket + 1 + LINK_SIZE, start_bracket,
6018              code - start_bracket);
6019            *start_bracket = OP_ONCE;
6020            code += 1 + LINK_SIZE;
6021            PUT(start_bracket, 1, code - start_bracket);
6022            *code = OP_KET;
6023            PUT(code, 1, code - start_bracket);
6024            code += 1 + LINK_SIZE;
6025            length += 2 + 2*LINK_SIZE;
6026            }
6027          cd->open_caps = cd->open_caps->next;
6028          }
6029    
6030        /* Reset options if needed. */
6031    
6032      if ((options & PCRE_IMS) != oldims && *ptr == ')')      if ((options & PCRE_IMS) != oldims && *ptr == CHAR_RIGHT_PARENTHESIS)
6033        {        {
6034        *code++ = OP_OPT;        *code++ = OP_OPT;
6035        *code++ = oldims;        *code++ = oldims;
# Line 5514  for (;;) Line 6076  for (;;)
6076      {      {
6077      *code = OP_ALT;      *code = OP_ALT;
6078      PUT(code, 1, code - last_branch);      PUT(code, 1, code - last_branch);
6079      bc.current = last_branch = code;      bc.current_branch = last_branch = code;
6080      code += 1 + LINK_SIZE;      code += 1 + LINK_SIZE;
6081      }      }
6082    
# Line 5596  do { Line 6158  do {
6158       if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;       if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
6159       }       }
6160    
6161     /* .* is not anchored unless DOTALL is set and it isn't in brackets that     /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
6162     are or may be referenced. */     it isn't in brackets that are or may be referenced. */
6163    
6164     else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||     else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
6165               op == OP_TYPEPOSSTAR) &&               op == OP_TYPEPOSSTAR))
             (*options & PCRE_DOTALL) != 0)  
6166       {       {
6167       if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;       if (scode[1] != OP_ALLANY || (bracket_map & backref_map) != 0)
6168           return FALSE;
6169       }       }
6170    
6171     /* Check for explicit anchoring */     /* Check for explicit anchoring */
# Line 5649  do { Line 6211  do {
6211       NULL, 0, FALSE);       NULL, 0, FALSE);
6212     register int op = *scode;     register int op = *scode;
6213    
6214       /* If we are at the start of a conditional assertion group, *both* the
6215       conditional assertion *and* what follows the condition must satisfy the test
6216       for start of line. Other kinds of condition fail. Note that there may be an
6217       auto-callout at the start of a condition. */
6218    
6219       if (op == OP_COND)
6220         {
6221         scode += 1 + LINK_SIZE;
6222         if (*scode == OP_CALLOUT) scode += _pcre_OP_lengths[OP_CALLOUT];
6223         switch (*scode)
6224           {
6225           case OP_CREF:
6226           case OP_NCREF:
6227           case OP_RREF:
6228           case OP_NRREF:
6229           case OP_DEF:
6230           return FALSE;
6231    
6232           default:     /* Assertion */
6233           if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
6234           do scode += GET(scode, 1); while (*scode == OP_ALT);
6235           scode += 1 + LINK_SIZE;
6236           break;
6237           }
6238         scode = first_significant_code(scode, NULL, 0, FALSE);
6239         op = *scode;
6240         }
6241    
6242     /* Non-capturing brackets */     /* Non-capturing brackets */
6243    
6244     if (op == OP_BRA)     if (op == OP_BRA)
# Line 5667  do { Line 6257  do {
6257    
6258     /* Other brackets */     /* Other brackets */
6259    
6260     else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)     else if (op == OP_ASSERT || op == OP_ONCE)
6261       { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }       {
6262         if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
6263         }
6264    
6265     /* .* means "start at start or after \n" if it isn't in brackets that     /* .* means "start at start or after \n" if it isn't in brackets that
6266     may be referenced. */     may be referenced. */
# Line 5785  Returns: pointer to compiled data Line 6377  Returns: pointer to compiled data
6377                  with errorptr and erroroffset set                  with errorptr and erroroffset set
6378  */  */
6379    
6380  PCRE_EXP_DEFN pcre *  PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
6381  pcre_compile(const char *pattern, int options, const char **errorptr,  pcre_compile(const char *pattern, int options, const char **errorptr,
6382    int *erroroffset, const unsigned char *tables)    int *erroroffset, const unsigned char *tables)
6383  {  {
# Line 5793  return pcre_compile2(pattern, options, N Line 6385  return pcre_compile2(pattern, options, N
6385  }  }
6386    
6387    
6388  PCRE_EXP_DEFN pcre *  PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
6389  pcre_compile2(const char *pattern, int options, int *errorcodeptr,  pcre_compile2(const char *pattern, int options, int *errorcodeptr,
6390    const char **errorptr, int *erroroffset, const unsigned char *tables)    const char **errorptr, int *erroroffset, const unsigned char *tables)
6391  {  {
# Line 5802  int length = 1; /* For final END opcode Line 6394  int length = 1; /* For final END opcode
6394  int firstbyte, reqbyte, newline;  int firstbyte, reqbyte, newline;
6395  int errorcode = 0;  int errorcode = 0;
6396  int skipatstart = 0;  int skipatstart = 0;
6397  #ifdef SUPPORT_UTF8  BOOL utf8 = (options & PCRE_UTF8) != 0;
 BOOL utf8;  
 #endif  
6398  size_t size;  size_t size;
6399  uschar *code;  uschar *code;
6400  const uschar *codestart;  const uschar *codestart;
# Line 5847  if (erroroffset == NULL) Line 6437  if (erroroffset == NULL)
6437    
6438  *erroroffset = 0;  *erroroffset = 0;
6439    
 /* Can't support UTF8 unless PCRE has been compiled to include the code. */  
   
 #ifdef SUPPORT_UTF8  
 utf8 = (options & PCRE_UTF8) != 0;  
 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&  
      (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)  
   {  
   errorcode = ERR44;  
   goto PCRE_EARLY_ERROR_RETURN2;  
   }  
 #else  
 if ((options & PCRE_UTF8) != 0)  
   {  
   errorcode = ERR32;  
   goto PCRE_EARLY_ERROR_RETURN;  
   }  
 #endif  
   
 if ((options & ~PUBLIC_OPTIONS) != 0)  
   {  
   errorcode = ERR17;  
   goto PCRE_EARLY_ERROR_RETURN;  
   }  
   
6440  /* Set up pointers to the individual character tables */  /* Set up pointers to the individual character tables */
6441    
6442  if (tables == NULL) tables = _pcre_default_tables;  if (tables == NULL) tables = _pcre_default_tables;
# Line 5879  cd->fcc = tables + fcc_offset; Line 6445  cd->fcc = tables + fcc_offset;
6445  cd->cbits = tables + cbits_offset;  cd->cbits = tables + cbits_offset;
6446  cd->ctypes = tables + ctypes_offset;  cd->ctypes = tables + ctypes_offset;
6447    
6448    /* Check that all undefined public option bits are zero */
6449    
6450    if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0)
6451      {
6452      errorcode = ERR17;
6453      goto PCRE_EARLY_ERROR_RETURN;
6454      }
6455    
6456  /* Check for global one-time settings at the start of the pattern, and remember  /* Check for global one-time settings at the start of the pattern, and remember
6457  the offset for later. */  the offset for later. */
6458    
6459  while (ptr[skipatstart] == '(' && ptr[skipatstart+1] == '*')  while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
6460           ptr[skipatstart+1] == CHAR_ASTERISK)
6461    {    {
6462    int newnl = 0;    int newnl = 0;
6463    int newbsr = 0;    int newbsr = 0;
6464    
6465    if (strncmp((char *)(ptr+skipatstart+2), "CR)", 3) == 0)    if (strncmp((char *)(ptr+skipatstart+2), STRING_UTF8_RIGHTPAR, 5) == 0)
6466        { skipatstart += 7; options |= PCRE_UTF8; continue; }
6467    
6468      if (strncmp((char *)(ptr+skipatstart+2), STRING_CR_RIGHTPAR, 3) == 0)
6469      { skipatstart += 5; newnl = PCRE_NEWLINE_CR; }      { skipatstart += 5; newnl = PCRE_NEWLINE_CR; }
6470    else if (strncmp((char *)(ptr+skipatstart+2), "LF)", 3)  == 0)    else if (strncmp((char *)(ptr+skipatstart+2), STRING_LF_RIGHTPAR, 3)  == 0)
6471      { skipatstart += 5; newnl = PCRE_NEWLINE_LF; }      { skipatstart += 5; newnl = PCRE_NEWLINE_LF; }
6472    else if (strncmp((char *)(ptr+skipatstart+2), "CRLF)", 5)  == 0)    else if (strncmp((char *)(ptr+skipatstart+2), STRING_CRLF_RIGHTPAR, 5)  == 0)
6473      { skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; }      { skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; }
6474    else if (strncmp((char *)(ptr+skipatstart+2), "ANY)", 4) == 0)    else if (strncmp((char *)(ptr+skipatstart+2), STRING_ANY_RIGHTPAR, 4) == 0)
6475      { skipatstart += 6; newnl = PCRE_NEWLINE_ANY; }      { skipatstart += 6; newnl = PCRE_NEWLINE_ANY; }
6476    else if (strncmp((char *)(ptr+skipatstart+2), "ANYCRLF)", 8)  == 0)    else if (strncmp((char *)(ptr+skipatstart+2), STRING_ANYCRLF_RIGHTPAR, 8) == 0)
6477      { skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; }      { skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; }
6478    
6479    else if (strncmp((char *)(ptr+skipatstart+2), "BSR_ANYCRLF)", 12) == 0)    else if (strncmp((char *)(ptr+skipatstart+2), STRING_BSR_ANYCRLF_RIGHTPAR, 12) == 0)
6480      { skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; }      { skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; }
6481    else if (strncmp((char *)(ptr+skipatstart+2), "BSR_UNICODE)", 12) == 0)    else if (strncmp((char *)(ptr+skipatstart+2), STRING_BSR_UNICODE_RIGHTPAR, 12) == 0)
6482      { skipatstart += 14; newbsr = PCRE_BSR_UNICODE; }      { skipatstart += 14; newbsr = PCRE_BSR_UNICODE; }
6483    
6484    if (newnl != 0)    if (newnl != 0)
# Line 5910  while (ptr[skipatstart] == '(' && ptr[sk Line 6488  while (ptr[skipatstart] == '(' && ptr[sk
6488    else break;    else break;
6489    }    }
6490    
6491    /* Can't support UTF8 unless PCRE has been compiled to include the code. */
6492    
6493    #ifdef SUPPORT_UTF8
6494    if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
6495         (*erroroffset = _pcre_valid_utf8((USPTR)pattern, -1)) >= 0)
6496      {
6497      errorcode = ERR44;
6498      goto PCRE_EARLY_ERROR_RETURN2;
6499      }
6500    #else
6501    if (utf8)
6502      {
6503      errorcode = ERR32;
6504      goto PCRE_EARLY_ERROR_RETURN;
6505      }
6506    #endif
6507    
6508  /* Check validity of \R options. */  /* Check validity of \R options. */
6509    
6510  switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))  switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
# Line 5928  current code allows for fixed one- or tw Line 6523  current code allows for fixed one- or tw
6523  switch (options & PCRE_NEWLINE_BITS)  switch (options & PCRE_NEWLINE_BITS)
6524    {    {
6525    case 0: newline = NEWLINE; break;   /* Build-time default */    case 0: newline = NEWLINE; break;   /* Build-time default */
6526    case PCRE_NEWLINE_CR: newline = '\r'; break;    case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
6527    case PCRE_NEWLINE_LF: newline = '\n'; break;    case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
6528    case PCRE_NEWLINE_CR+    case PCRE_NEWLINE_CR+
6529         PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;         PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
6530    case PCRE_NEWLINE_ANY: newline = -1; break;    case PCRE_NEWLINE_ANY: newline = -1; break;
6531    case PCRE_NEWLINE_ANYCRLF: newline = -2; break;    case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6532    default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;    default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
# Line 5992  cd->end_pattern = (const uschar *)(patte Line 6587  cd->end_pattern = (const uschar *)(patte
6587  cd->req_varyopt = 0;  cd->req_varyopt = 0;
6588  cd->external_options = options;  cd->external_options = options;
6589  cd->external_flags = 0;  cd->external_flags = 0;
6590    cd->open_caps = NULL;
6591    
6592  /* Now do the pre-compile. On error, errorcode will be set non-zero, so we  /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
6593  don't need to look at the result of the function here. The initial options have  don't need to look at the result of the function here. The initial options have
# Line 6066  cd->start_code = codestart; Line 6662  cd->start_code = codestart;
6662  cd->hwm = cworkspace;  cd->hwm = cworkspace;
6663  cd->req_varyopt = 0;  cd->req_varyopt = 0;
6664  cd->had_accept = FALSE;  cd->had_accept = FALSE;
6665    cd->check_lookbehind = FALSE;
6666    cd->open_caps = NULL;
6667    
6668  /* Set up a starting, non-extracting bracket, then compile the expression. On  /* Set up a starting, non-extracting bracket, then compile the expression. On
6669  error, errorcode will be set non-zero, so we don't need to look at the result  error, errorcode will be set non-zero, so we don't need to look at the result
# Line 6091  if debugging, leave the test till after Line 6689  if debugging, leave the test till after
6689    
6690  *code++ = OP_END;  *code++ = OP_END;
6691    
6692  #ifndef DEBUG  #ifndef PCRE_DEBUG
6693  if (code - codestart > length) errorcode = ERR23;  if (code - codestart > length) errorcode = ERR23;
6694  #endif  #endif
6695    
# Line 6104  while (errorcode == 0 && cd->hwm > cwork Line 6702  while (errorcode == 0 && cd->hwm > cwork
6702    cd->hwm -= LINK_SIZE;    cd->hwm -= LINK_SIZE;
6703    offset = GET(cd->hwm, 0);    offset = GET(cd->hwm, 0);
6704    recno = GET(codestart, offset);    recno = GET(codestart, offset);
6705    groupptr = find_bracket(codestart, (re->options & PCRE_UTF8) != 0, recno);    groupptr = _pcre_find_bracket(codestart, utf8, recno);
6706    if (groupptr == NULL) errorcode = ERR53;    if (groupptr == NULL) errorcode = ERR53;
6707      else PUT(((uschar *)codestart), offset, groupptr - codestart);      else PUT(((uschar *)codestart), offset, groupptr - codestart);
6708    }    }
# Line 6114  subpattern. */ Line 6712  subpattern. */
6712    
6713  if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;  if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
6714    
6715    /* If there were any lookbehind assertions that contained OP_RECURSE
6716    (recursions or subroutine calls), a flag is set for them to be checked here,
6717    because they may contain forward references. Actual recursions can't be fixed
6718    length, but subroutine calls can. It is done like this so that those without
6719    OP_RECURSE that are not fixed length get a diagnostic with a useful offset. The
6720    exceptional ones forgo this. We scan the pattern to check that they are fixed
6721    length, and set their lengths. */
6722    
6723    if (cd->check_lookbehind)
6724      {
6725      uschar *cc = (uschar *)codestart;
6726    
6727      /* Loop, searching for OP_REVERSE items, and process those that do not have
6728      their length set. (Actually, it will also re-process any that have a length
6729      of zero, but that is a pathological case, and it does no harm.) When we find
6730      one, we temporarily terminate the branch it is in while we scan it. */
6731    
6732      for (cc = (uschar *)_pcre_find_bracket(codestart, utf8, -1);
6733           cc != NULL;
6734           cc = (uschar *)_pcre_find_bracket(cc, utf8, -1))
6735        {
6736        if (GET(cc, 1) == 0)
6737          {
6738          int fixed_length;
6739          uschar *be = cc - 1 - LINK_SIZE + GET(cc, -LINK_SIZE);
6740          int end_op = *be;
6741          *be = OP_END;
6742          fixed_length = find_fixedlength(cc, re->options, TRUE, cd);
6743          *be = end_op;
6744          DPRINTF(("fixed length = %d\n", fixed_length));
6745          if (fixed_length < 0)
6746            {
6747            errorcode = (fixed_length == -2)? ERR36 : ERR25;
6748            break;
6749            }
6750          PUT(cc, 1, fixed_length);
6751          }
6752        cc += 1 + LINK_SIZE;
6753        }
6754      }
6755    
6756  /* Failed to compile, or error while post-processing */  /* Failed to compile, or error while post-processing */
6757    
6758  if (errorcode != 0)  if (errorcode != 0)
# Line 6174  if (reqbyte >= 0 && Line 6813  if (reqbyte >= 0 &&
6813  /* Print out the compiled data if debugging is enabled. This is never the  /* Print out the compiled data if debugging is enabled. This is never the
6814  case when building a production library. */  case when building a production library. */
6815    
6816  #ifdef DEBUG  #ifdef PCRE_DEBUG
   
6817  printf("Length = %d top_bracket = %d top_backref = %d\n",  printf("Length = %d top_bracket = %d top_backref = %d\n",
6818    length, re->top_bracket, re->top_backref);    length, re->top_bracket, re->top_backref);
6819    
# Line 6212  if (code - codestart > length) Line 6850  if (code - codestart > length)
6850    if (errorcodeptr != NULL) *errorcodeptr = ERR23;    if (errorcodeptr != NULL) *errorcodeptr = ERR23;
6851    return NULL;    return NULL;
6852    }    }
6853  #endif   /* DEBUG */  #endif   /* PCRE_DEBUG */
6854    
6855  return (pcre *)re;  return (pcre *)re;
6856  }  }

Legend:
Removed from v.323  
changed lines
  Added in v.496

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12