/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory | Revision Log | View Patch

revision 91 by nigel, Sat Feb 24 21:41:34 2007 UTC revision 93 by nigel, Sat Feb 24 21:41:42 2007 UTC
# Line 42  POSSIBILITY OF SUCH DAMAGE. Line 42  POSSIBILITY OF SUCH DAMAGE.
42  supporting internal functions that are not used by other modules. */  supporting internal functions that are not used by other modules. */
43    
44    
45  #define NLBLOCK cd            /* The block containing newline information */  #define NLBLOCK cd             /* Block containing newline information */
46    #define PSSTART start_pattern  /* Field containing processed string start */
47    #define PSEND   end_pattern    /* Field containing processed string end */
48    
49    
50  #include "pcre_internal.h"  #include "pcre_internal.h"
51    
52    
# Line 54  used by pcretest. DEBUG is not defined w Line 58  used by pcretest. DEBUG is not defined w
58  #endif  #endif
59    
60    
   
61  /*************************************************  /*************************************************
62  *      Code parameters and static tables         *  *      Code parameters and static tables         *
63  *************************************************/  *************************************************/
64    
65  /* Maximum number of items on the nested bracket stacks at compile time. This  /* This value specifies the size of stack workspace that is used during the
66  applies to the nesting of all kinds of parentheses. It does not limit  first pre-compile phase that determines how much memory is required. The regex
67  un-nested, non-capturing parentheses. This number can be made bigger if  is partly compiled into this space, but the compiled parts are discarded as
68  necessary - it is used to dimension one int and one unsigned char vector at  soon as they can be, so that hopefully there will never be an overrun. The code
69  compile time. */  does, however, check for an overrun. The largest amount I've seen used is 218,
70    so this number is very generous.
71    
72    The same workspace is used during the second, actual compile phase for
73    remembering forward references to groups so that they can be filled in at the
74    end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
75    is 4 there is plenty of room. */
76    
77  #define BRASTACK_SIZE 200  #define COMPILE_WORK_SIZE (4096)
78    
79    
80  /* Table for handling escaped characters in the range '0'-'z'. Positive returns  /* Table for handling escaped characters in the range '0'-'z'. Positive returns
# Line 79  static const short int escapes[] = { Line 88  static const short int escapes[] = {
88       0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */       0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */
89     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */
90       0,      0,      0,      0,      0,      0,      0,      0,   /* H - O */       0,      0,      0,      0,      0,      0,      0,      0,   /* H - O */
91  -ESC_P, -ESC_Q,      0, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */  -ESC_P, -ESC_Q, -ESC_R, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */
92  -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */  -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */
93     '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */     '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */
94       0,      0,      0,      0,      0,      0,  ESC_n,      0,   /* h - o */       0,      0,      0, -ESC_k,      0,      0,  ESC_n,      0,   /* h - o */
95  -ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0,      0, -ESC_w,   /* p - w */  -ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0,      0, -ESC_w,   /* p - w */
96       0,      0, -ESC_z                                            /* x - z */       0,      0, -ESC_z                                            /* x - z */
97  };  };
# Line 98  static const short int escapes[] = { Line 107  static const short int escapes[] = {
107  /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',  /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
108  /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,  /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
109  /*  88 */     0,     0,      0,     '{',      0,     0,      0,      0,  /*  88 */     0,     0,      0,     '{',      0,     0,      0,      0,
110  /*  90 */     0,     0,      0,     'l',      0, ESC_n,      0, -ESC_p,  /*  90 */     0,     0, -ESC_k,     'l',      0, ESC_n,      0, -ESC_p,
111  /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,  /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
112  /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,     0, -ESC_w,      0,  /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,     0, -ESC_w,      0,
113  /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,  /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
# Line 107  static const short int escapes[] = { Line 116  static const short int escapes[] = {
116  /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,  /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
117  /*  C8 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  C8 */     0,     0,      0,       0,      0,     0,      0,      0,
118  /*  D0 */   '}',     0,      0,       0,      0,     0,      0, -ESC_P,  /*  D0 */   '}',     0,      0,       0,      0,     0,      0, -ESC_P,
119  /*  D8 */-ESC_Q,     0,      0,       0,      0,     0,      0,      0,  /*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
120  /*  E0 */  '\\',     0, -ESC_S,       0,      0,     0, -ESC_W, -ESC_X,  /*  E0 */  '\\',     0, -ESC_S,       0,      0,     0, -ESC_W, -ESC_X,
121  /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,  /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
122  /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
# Line 156  static const int posix_class_maps[] = { Line 165  static const int posix_class_maps[] = {
165  };  };
166    
167    
168    #define STRING(a)  # a
169    #define XSTRING(s) STRING(s)
170    
171  /* The texts of compile-time error messages. These are "char *" because they  /* The texts of compile-time error messages. These are "char *" because they
172  are passed to the outside world. */  are passed to the outside world. Do not ever re-use any error number, because
173    they are documented. Always add a new error instead. Messages marked DEAD below
174    are no longer used. */
175    
176  static const char *error_texts[] = {  static const char *error_texts[] = {
177    "no error",    "no error",
# Line 172  static const char *error_texts[] = { Line 186  static const char *error_texts[] = {
186    "range out of order in character class",    "range out of order in character class",
187    "nothing to repeat",    "nothing to repeat",
188    /* 10 */    /* 10 */
189    "operand of unlimited repeat could match the empty string",    "operand of unlimited repeat could match the empty string",  /** DEAD **/
190    "internal error: unexpected repeat",    "internal error: unexpected repeat",
191    "unrecognized character after (?",    "unrecognized character after (?",
192    "POSIX named classes are supported only within a class",    "POSIX named classes are supported only within a class",
# Line 182  static const char *error_texts[] = { Line 196  static const char *error_texts[] = {
196    "erroffset passed as NULL",    "erroffset passed as NULL",
197    "unknown option bit(s) set",    "unknown option bit(s) set",
198    "missing ) after comment",    "missing ) after comment",
199    "parentheses nested too deeply",    "parentheses nested too deeply",  /** DEAD **/
200    /* 20 */    /* 20 */
201    "regular expression too large",    "regular expression too large",
202    "failed to get memory",    "failed to get memory",
# Line 199  static const char *error_texts[] = { Line 213  static const char *error_texts[] = {
213    "unknown POSIX class name",    "unknown POSIX class name",
214    "POSIX collating elements are not supported",    "POSIX collating elements are not supported",
215    "this version of PCRE is not compiled with PCRE_UTF8 support",    "this version of PCRE is not compiled with PCRE_UTF8 support",
216    "spare error",    "spare error",  /** DEAD **/
217    "character value in \\x{...} sequence is too large",    "character value in \\x{...} sequence is too large",
218    /* 35 */    /* 35 */
219    "invalid condition (?(0)",    "invalid condition (?(0)",
# Line 210  static const char *error_texts[] = { Line 224  static const char *error_texts[] = {
224    /* 40 */    /* 40 */
225    "recursive call could loop indefinitely",    "recursive call could loop indefinitely",
226    "unrecognized character after (?P",    "unrecognized character after (?P",
227    "syntax error after (?P",    "syntax error in subpattern name (missing terminator)",
228    "two named subpatterns have the same name",    "two named subpatterns have the same name",
229    "invalid UTF-8 string",    "invalid UTF-8 string",
230    /* 45 */    /* 45 */
231    "support for \\P, \\p, and \\X has not been compiled",    "support for \\P, \\p, and \\X has not been compiled",
232    "malformed \\P or \\p sequence",    "malformed \\P or \\p sequence",
233    "unknown property name after \\P or \\p",    "unknown property name after \\P or \\p",
234    "subpattern name is too long (maximum 32 characters)",    "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)",
235    "too many named subpatterns (maximum 10,000)",    "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")",
236    /* 50 */    /* 50 */
237    "repeated subpattern is too long",    "repeated subpattern is too long",
238    "octal value is greater than \\377 (not in UTF-8 mode)"    "octal value is greater than \\377 (not in UTF-8 mode)",
239      "internal error: overran compiling workspace",
240      "internal error: previously-checked referenced subpattern not found",
241      "DEFINE group contains more than one branch",
242      /* 55 */
243      "repeating a DEFINE group is not allowed",
244      "inconsistent NEWLINE options",
245      "\\g is not followed by an (optionally braced) non-zero number"
246  };  };
247    
248    
# Line 352  static const unsigned char ebcdic_charta Line 373  static const unsigned char ebcdic_charta
373  /* Definition to allow mutual recursion */  /* Definition to allow mutual recursion */
374    
375  static BOOL  static BOOL
376    compile_regex(int, int, int *, uschar **, const uschar **, int *, BOOL, int,    compile_regex(int, int, uschar **, const uschar **, int *, BOOL, int, int *,
377      int *, int *, branch_chain *, compile_data *);      int *, branch_chain *, compile_data *, int *);
378    
379    
380    
# Line 363  static BOOL Line 384  static BOOL
384    
385  /* This function is called when a \ has been encountered. It either returns a  /* This function is called when a \ has been encountered. It either returns a
386  positive value for a simple escape such as \n, or a negative value which  positive value for a simple escape such as \n, or a negative value which
387  encodes one of the more complicated things such as \d. When UTF-8 is enabled,  encodes one of the more complicated things such as \d. A backreference to group
388  a positive value greater than 255 may be returned. On entry, ptr is pointing at  n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
389  the \. On exit, it is on the final character of the escape sequence.  UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
390    ptr is pointing at the \. On exit, it is on the final character of the escape
391    sequence.
392    
393  Arguments:  Arguments:
394    ptrptr         points to the pattern position pointer    ptrptr         points to the pattern position pointer
# Line 412  else if ((i = escapes[c - 0x48]) != 0) Line 435  else if ((i = escapes[c - 0x48]) != 0)
435  else  else
436    {    {
437    const uschar *oldptr;    const uschar *oldptr;
438      BOOL braced, negated;
439    
440    switch (c)    switch (c)
441      {      {
442      /* A number of Perl escapes are not handled by PCRE. We give an explicit      /* A number of Perl escapes are not handled by PCRE. We give an explicit
# Line 425  else Line 450  else
450      *errorcodeptr = ERR37;      *errorcodeptr = ERR37;
451      break;      break;
452    
453        /* \g must be followed by a number, either plain or braced. If positive, it
454        is an absolute backreference. If negative, it is a relative backreference.
455        This is a Perl 5.10 feature. */
456    
457        case 'g':
458        if (ptr[1] == '{')
459          {
460          braced = TRUE;
461          ptr++;
462          }
463        else braced = FALSE;
464    
465        if (ptr[1] == '-')
466          {
467          negated = TRUE;
468          ptr++;
469          }
470        else negated = FALSE;
471    
472        c = 0;
473        while ((digitab[ptr[1]] & ctype_digit) != 0)
474          c = c * 10 + *(++ptr) - '0';
475    
476        if (c == 0 || (braced && *(++ptr) != '}'))
477          {
478          *errorcodeptr = ERR57;
479          return 0;
480          }
481    
482        if (negated)
483          {
484          if (c > bracount)
485            {
486            *errorcodeptr = ERR15;
487            return 0;
488            }
489          c = bracount - (c - 1);
490          }
491    
492        c = -(ESC_REF + c);
493        break;
494    
495      /* The handling of escape sequences consisting of a string of digits      /* The handling of escape sequences consisting of a string of digits
496      starting with one that is not zero is not straightforward. By experiment,      starting with one that is not zero is not straightforward. By experiment,
497      the way Perl works seems to be as follows:      the way Perl works seems to be as follows:
# Line 532  else Line 599  else
599        }        }
600      break;      break;
601    
602      /* Other special escapes not starting with a digit are straightforward */      /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
603        This coding is ASCII-specific, but then the whole concept of \cx is
604        ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
605    
606      case 'c':      case 'c':
607      c = *(++ptr);      c = *(++ptr);
# Line 542  else Line 611  else
611        return 0;        return 0;
612        }        }
613    
     /* A letter is upper-cased; then the 0x40 bit is flipped. This coding  
     is ASCII-specific, but then the whole concept of \cx is ASCII-specific.  
     (However, an EBCDIC equivalent has now been added.) */  
   
614  #if !EBCDIC    /* ASCII coding */  #if !EBCDIC    /* ASCII coding */
615      if (c >= 'a' && c <= 'z') c -= 32;      if (c >= 'a' && c <= 'z') c -= 32;
616      c ^= 0x40;      c ^= 0x40;
# Line 772  return p; Line 837  return p;
837    
838    
839  /*************************************************  /*************************************************
840  *     Find forward referenced named subpattern   *  *       Find forward referenced subpattern       *
841  *************************************************/  *************************************************/
842    
843  /* This function scans along a pattern looking for capturing subpatterns, and  /* This function scans along a pattern's text looking for capturing
844  counting them. If it finds a named pattern that matches the name it is given,  subpatterns, and counting them. If it finds a named pattern that matches the
845  it returns its number. This is used for forward references to named  name it is given, it returns its number. Alternatively, if the name is NULL, it
846  subpatterns. We know that if (?P< is encountered, the name will be terminated  returns when it reaches a given numbered subpattern. This is used for forward
847  by '>' because that is checked in the first pass.  references to subpatterns. We know that if (?P< is encountered, the name will
848    be terminated by '>' because that is checked in the first pass.
849    
850  Arguments:  Arguments:
851    pointer      current position in the pattern    ptr          current position in the pattern
852    count        current count of capturing parens    count        current count of capturing parens so far encountered
853    name         name to seek    name         name to seek, or NULL if seeking a numbered subpattern
854    namelen      name length    lorn         name length, or subpattern number if name is NULL
855      xmode        TRUE if we are in /x mode
856    
857  Returns:       the number of the named subpattern, or -1 if not found  Returns:       the number of the matching (named or numbered) subpattern, or -1 if not found
858  */  */
859    
860  static int  static int
861  find_named_parens(const uschar *ptr, int count, const uschar *name, int namelen)  find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
862      BOOL xmode)
863  {  {
864  const uschar *thisname;  const uschar *thisname;
865    
866  for (; *ptr != 0; ptr++)  for (; *ptr != 0; ptr++)
867    {    {
868    if (*ptr == '\\' && ptr[1] != 0) { ptr++; continue; }    int term;
869    
870      /* Skip over backslashed characters and also entire \Q...\E */
871    
872      if (*ptr == '\\')
873        {
874        if (*(++ptr) == 0) return -1;
875        if (*ptr == 'Q') for (;;)
876          {
877          while (*(++ptr) != 0 && *ptr != '\\');
878          if (*ptr == 0) return -1;
879          if (*(++ptr) == 'E') break;
880          }
881        continue;
882        }
883    
884      /* Skip over character classes */
885    
886      if (*ptr == '[')
887        {
888        while (*(++ptr) != ']')
889          {
890          if (*ptr == '\\')
891            {
892            if (*(++ptr) == 0) return -1;
893            if (*ptr == 'Q') for (;;)
894              {
895              while (*(++ptr) != 0 && *ptr != '\\');
896              if (*ptr == 0) return -1;
897              if (*(++ptr) == 'E') break;
898              }
899            continue;
900            }
901          }
902        continue;
903        }
904    
905      /* Skip comments in /x mode */
906    
907      if (xmode && *ptr == '#')
908        {
909        while (*(++ptr) != 0 && *ptr != '\n');
910        if (*ptr == 0) return -1;
911        continue;
912        }
913    
914      /* An opening parens must now be a real metacharacter */
915    
916    if (*ptr != '(') continue;    if (*ptr != '(') continue;
917    if (ptr[1] != '?') { count++; continue; }    if (ptr[1] != '?')
918    if (ptr[2] == '(') { ptr += 2; continue; }      {
919    if (ptr[2] != 'P' || ptr[3] != '<') continue;      count++;
920        if (name == NULL && count == lorn) return count;
921        continue;
922        }
923    
924      ptr += 2;
925      if (*ptr == 'P') ptr++;                      /* Allow optional P */
926    
927      /* We have to disambiguate (?<! and (?<= from (?<name> */
928    
929      if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
930           *ptr != '\'')
931        continue;
932    
933    count++;    count++;
934    ptr += 4;  
935      if (name == NULL && count == lorn) return count;
936      term = *ptr++;
937      if (term == '<') term = '>';
938    thisname = ptr;    thisname = ptr;
939    while (*ptr != '>') ptr++;    while (*ptr != term) ptr++;
940    if (namelen == ptr - thisname && strncmp(name, thisname, namelen) == 0)    if (name != NULL && lorn == ptr - thisname &&
941          strncmp((const char *)name, (const char *)thisname, lorn) == 0)
942      return count;      return count;
943    }    }
944    
945  return -1;  return -1;
946  }  }
947    
# Line 862  for (;;) Line 996  for (;;)
996    
997      case OP_CALLOUT:      case OP_CALLOUT:
998      case OP_CREF:      case OP_CREF:
999      case OP_BRANUMBER:      case OP_RREF:
1000        case OP_DEF:
1001      code += _pcre_OP_lengths[*code];      code += _pcre_OP_lengths[*code];
1002      break;      break;
1003    
# Line 907  for (;;) Line 1042  for (;;)
1042    {    {
1043    int d;    int d;
1044    register int op = *cc;    register int op = *cc;
   if (op >= OP_BRA) op = OP_BRA;  
1045    
1046    switch (op)    switch (op)
1047      {      {
1048        case OP_CBRA:
1049      case OP_BRA:      case OP_BRA:
1050      case OP_ONCE:      case OP_ONCE:
1051      case OP_COND:      case OP_COND:
1052      d = find_fixedlength(cc, options);      d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1053      if (d < 0) return d;      if (d < 0) return d;
1054      branchlength += d;      branchlength += d;
1055      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
# Line 949  for (;;) Line 1084  for (;;)
1084      /* Skip over things that don't match chars */      /* Skip over things that don't match chars */
1085    
1086      case OP_REVERSE:      case OP_REVERSE:
     case OP_BRANUMBER:  
1087      case OP_CREF:      case OP_CREF:
1088        case OP_RREF:
1089        case OP_DEF:
1090      case OP_OPT:      case OP_OPT:
1091      case OP_CALLOUT:      case OP_CALLOUT:
1092      case OP_SOD:      case OP_SOD:
# Line 1094  for (;;) Line 1230  for (;;)
1230    
1231    if (c == OP_XCLASS) code += GET(code, 1);    if (c == OP_XCLASS) code += GET(code, 1);
1232    
1233    /* Handle bracketed group */    /* Handle capturing bracket */
1234    
1235    else if (c > OP_BRA)    else if (c == OP_CBRA)
1236      {      {
1237      int n = c - OP_BRA;      int n = GET2(code, 1+LINK_SIZE);
     if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE);  
1238      if (n == number) return (uschar *)code;      if (n == number) return (uschar *)code;
1239      code += _pcre_OP_lengths[OP_BRA];      code += _pcre_OP_lengths[c];
1240      }      }
1241    
1242    /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes    /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1243    that are followed by a character may be followed by a multi-byte character.    a multi-byte character. The length in the table is a minimum, so we have to
1244    The length in the table is a minimum, so we have to scan along to skip the    arrange to skip the extra bytes. */
   extra bytes. All opcodes are less than 128, so we can use relatively  
   efficient code. */  
1245    
1246    else    else
1247      {      {
# Line 1120  for (;;) Line 1253  for (;;)
1253        case OP_EXACT:        case OP_EXACT:
1254        case OP_UPTO:        case OP_UPTO:
1255        case OP_MINUPTO:        case OP_MINUPTO:
1256          case OP_POSUPTO:
1257        case OP_STAR:        case OP_STAR:
1258        case OP_MINSTAR:        case OP_MINSTAR:
1259          case OP_POSSTAR:
1260        case OP_PLUS:        case OP_PLUS:
1261        case OP_MINPLUS:        case OP_MINPLUS:
1262          case OP_POSPLUS:
1263        case OP_QUERY:        case OP_QUERY:
1264        case OP_MINQUERY:        case OP_MINQUERY:
1265        while ((*code & 0xc0) == 0x80) code++;        case OP_POSQUERY:
1266          if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1267        break;        break;
1268        }        }
1269      }      }
# Line 1164  for (;;) Line 1301  for (;;)
1301    
1302    if (c == OP_XCLASS) code += GET(code, 1);    if (c == OP_XCLASS) code += GET(code, 1);
1303    
   /* All bracketed groups have the same length. */  
   
   else if (c > OP_BRA)  
     {  
     code += _pcre_OP_lengths[OP_BRA];  
     }  
   
1304    /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes    /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes
1305    that are followed by a character may be followed by a multi-byte character.    that are followed by a character may be followed by a multi-byte character.
1306    The length in the table is a minimum, so we have to scan along to skip the    The length in the table is a minimum, so we have to arrange to skip the extra
1307    extra bytes. All opcodes are less than 128, so we can use relatively    bytes. */
   efficient code. */  
1308    
1309    else    else
1310      {      {
# Line 1187  for (;;) Line 1316  for (;;)
1316        case OP_EXACT:        case OP_EXACT:
1317        case OP_UPTO:        case OP_UPTO:
1318        case OP_MINUPTO:        case OP_MINUPTO:
1319          case OP_POSUPTO:
1320        case OP_STAR:        case OP_STAR:
1321        case OP_MINSTAR:        case OP_MINSTAR:
1322          case OP_POSSTAR:
1323        case OP_PLUS:        case OP_PLUS:
1324        case OP_MINPLUS:        case OP_MINPLUS:
1325          case OP_POSPLUS:
1326        case OP_QUERY:        case OP_QUERY:
1327        case OP_MINQUERY:        case OP_MINQUERY:
1328        while ((*code & 0xc0) == 0x80) code++;        case OP_POSQUERY:
1329          if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1330        break;        break;
1331        }        }
1332      }      }
# Line 1207  for (;;) Line 1340  for (;;)
1340  *************************************************/  *************************************************/
1341    
1342  /* This function scans through a branch of a compiled pattern to see whether it  /* This function scans through a branch of a compiled pattern to see whether it
1343  can match the empty string or not. It is called only from could_be_empty()  can match the empty string or not. It is called from could_be_empty()
1344  below. Note that first_significant_code() skips over assertions. If we hit an  below and from compile_branch() when checking for an unlimited repeat of a
1345  unclosed bracket, we return "empty" - this means we've struck an inner bracket  group that can match nothing. Note that first_significant_code() skips over
1346  whose current branch will already have been scanned.  assertions. If we hit an unclosed bracket, we return "empty" - this means we've
1347    struck an inner bracket whose current branch will already have been scanned.
1348    
1349  Arguments:  Arguments:
1350    code        points to start of search    code        points to start of search
# Line 1224  static BOOL Line 1358  static BOOL
1358  could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)  could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1359  {  {
1360  register int c;  register int c;
1361  for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0, TRUE);  for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1362       code < endcode;       code < endcode;
1363       code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))       code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1364    {    {
# Line 1232  for (code = first_significant_code(code Line 1366  for (code = first_significant_code(code
1366    
1367    c = *code;    c = *code;
1368    
1369    if (c >= OP_BRA)    if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE)
1370      {      {
1371      BOOL empty_branch;      BOOL empty_branch;
1372      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
# Line 1248  for (code = first_significant_code(code Line 1382  for (code = first_significant_code(code
1382        }        }
1383      while (*code == OP_ALT);      while (*code == OP_ALT);
1384      if (!empty_branch) return FALSE;   /* All branches are non-empty */      if (!empty_branch) return FALSE;   /* All branches are non-empty */
1385      code += 1 + LINK_SIZE;  
1386      c = *code;      /* Move past the KET and fudge things so that the increment in the "for"
1387        above has no effect. */
1388    
1389        c = OP_END;
1390        code += 1 + LINK_SIZE - _pcre_OP_lengths[c];
1391        continue;
1392      }      }
1393    
1394    else switch (c)    /* Handle the other opcodes */
1395    
1396      switch (c)
1397      {      {
1398      /* Check for quantifiers after a class */      /* Check for quantifiers after a class */
1399    
# Line 1308  for (code = first_significant_code(code Line 1449  for (code = first_significant_code(code
1449      case OP_NOT:      case OP_NOT:
1450      case OP_PLUS:      case OP_PLUS:
1451      case OP_MINPLUS:      case OP_MINPLUS:
1452        case OP_POSPLUS:
1453      case OP_EXACT:      case OP_EXACT:
1454      case OP_NOTPLUS:      case OP_NOTPLUS:
1455      case OP_NOTMINPLUS:      case OP_NOTMINPLUS:
1456        case OP_NOTPOSPLUS:
1457      case OP_NOTEXACT:      case OP_NOTEXACT:
1458      case OP_TYPEPLUS:      case OP_TYPEPLUS:
1459      case OP_TYPEMINPLUS:      case OP_TYPEMINPLUS:
1460        case OP_TYPEPOSPLUS:
1461      case OP_TYPEEXACT:      case OP_TYPEEXACT:
1462      return FALSE;      return FALSE;
1463    
# Line 1325  for (code = first_significant_code(code Line 1469  for (code = first_significant_code(code
1469      case OP_ALT:      case OP_ALT:
1470      return TRUE;      return TRUE;
1471    
1472      /* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO  may be      /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1473      followed by a multibyte character */      MINUPTO, and POSUPTO may be followed by a multibyte character */
1474    
1475  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1476      case OP_STAR:      case OP_STAR:
1477      case OP_MINSTAR:      case OP_MINSTAR:
1478        case OP_POSSTAR:
1479      case OP_QUERY:      case OP_QUERY:
1480      case OP_MINQUERY:      case OP_MINQUERY:
1481        case OP_POSQUERY:
1482      case OP_UPTO:      case OP_UPTO:
1483      case OP_MINUPTO:      case OP_MINUPTO:
1484        case OP_POSUPTO:
1485      if (utf8) while ((code[2] & 0xc0) == 0x80) code++;      if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1486      break;      break;
1487  #endif  #endif
# Line 1452  earlier groups that are outside the curr Line 1599  earlier groups that are outside the curr
1599  optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before  optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1600  it, after it has been compiled. This means that any OP_RECURSE items within it  it, after it has been compiled. This means that any OP_RECURSE items within it
1601  that refer to the group itself or any contained groups have to have their  that refer to the group itself or any contained groups have to have their
1602  offsets adjusted. That is the job of this function. Before it is called, the  offsets adjusted. That is one of the jobs of this function. Before it is called,
1603  partially compiled regex must be temporarily terminated with OP_END.  the partially compiled regex must be temporarily terminated with OP_END.
1604    
1605    This function has been extended with the possibility of forward references for
1606    recursions and subroutine calls. It must also check the list of such references
1607    for the group we are dealing with. If it finds that one of the recursions in
1608    the current group is on this list, it adjusts the offset in the list, not the
1609    value in the reference (which is a group number).
1610    
1611  Arguments:  Arguments:
1612    group      points to the start of the group    group      points to the start of the group
1613    adjust     the amount by which the group is to be moved    adjust     the amount by which the group is to be moved
1614    utf8       TRUE in UTF-8 mode    utf8       TRUE in UTF-8 mode
1615    cd         contains pointers to tables etc.    cd         contains pointers to tables etc.
1616      save_hwm   the hwm forward reference pointer at the start of the group
1617    
1618  Returns:     nothing  Returns:     nothing
1619  */  */
1620    
1621  static void  static void
1622  adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd)  adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1623      uschar *save_hwm)
1624  {  {
1625  uschar *ptr = group;  uschar *ptr = group;
1626  while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)  while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1627    {    {
1628    int offset = GET(ptr, 1);    int offset;
1629    if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);    uschar *hc;
1630    
1631      /* See if this recursion is on the forward reference list. If so, adjust the
1632      reference. */
1633    
1634      for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1635        {
1636        offset = GET(hc, 0);
1637        if (cd->start_code + offset == ptr + 1)
1638          {
1639          PUT(hc, 0, offset + adjust);
1640          break;
1641          }
1642        }
1643    
1644      /* Otherwise, adjust the recursion offset if it's after the start of this
1645      group. */
1646    
1647      if (hc >= cd->hwm)
1648        {
1649        offset = GET(ptr, 1);
1650        if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1651        }
1652    
1653    ptr += 1 + LINK_SIZE;    ptr += 1 + LINK_SIZE;
1654    }    }
1655  }  }
# Line 1550  Yield: TRUE when range returned; Line 1728  Yield: TRUE when range returned;
1728  */  */
1729    
1730  static BOOL  static BOOL
1731  get_othercase_range(int *cptr, int d, int *ocptr, int *odptr)  get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
1732      unsigned int *odptr)
1733  {  {
1734  int c, othercase, next;  unsigned int c, othercase, next;
1735    
1736  for (c = *cptr; c <= d; c++)  for (c = *cptr; c <= d; c++)
1737    { if ((othercase = _pcre_ucp_othercase(c)) >= 0) break; }    { if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) break; }
1738    
1739  if (c > d) return FALSE;  if (c > d) return FALSE;
1740    
# Line 1576  return TRUE; Line 1755  return TRUE;
1755  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
1756    
1757    
1758    
1759    /*************************************************
1760    *     Check if auto-possessifying is possible    *
1761    *************************************************/
1762    
1763    /* This function is called for unlimited repeats of certain items, to see
1764    whether the next thing could possibly match the repeated item. If not, it makes
1765    sense to automatically possessify the repeated item.
1766    
1767    Arguments:
1768      op_code       the repeated op code
1769      item          data for this item, depends on the opcode
1770      utf8          TRUE in UTF-8 mode
1771      utf8_char     used for utf8 character bytes, NULL if not relevant
1772      ptr           next character in pattern
1773      options       options bits
1774      cd            contains pointers to tables etc.
1775    
1776    Returns:        TRUE if possessifying is wanted
1777    */
1778    
1779    static BOOL
1780    check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
1781      const uschar *ptr, int options, compile_data *cd)
1782    {
1783    int next;
1784    
1785    /* Skip whitespace and comments in extended mode */
1786    
1787    if ((options & PCRE_EXTENDED) != 0)
1788      {
1789      for (;;)
1790        {
1791        while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1792        if (*ptr == '#')
1793          {
1794          while (*(++ptr) != 0)
1795            if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1796          }
1797        else break;
1798        }
1799      }
1800    
1801    /* If the next item is one that we can handle, get its value. A non-negative
1802    value is a character, a negative value is an escape value. */
1803    
1804    if (*ptr == '\\')
1805      {
1806      int temperrorcode = 0;
1807      next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
1808      if (temperrorcode != 0) return FALSE;
1809      ptr++;    /* Point after the escape sequence */
1810      }
1811    
1812    else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
1813      {
1814    #ifdef SUPPORT_UTF8
1815      if (utf8) { GETCHARINC(next, ptr); } else
1816    #endif
1817      next = *ptr++;
1818      }
1819    
1820    else return FALSE;
1821    
1822    /* Skip whitespace and comments in extended mode */
1823    
1824    if ((options & PCRE_EXTENDED) != 0)
1825      {
1826      for (;;)
1827        {
1828        while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1829        if (*ptr == '#')
1830          {
1831          while (*(++ptr) != 0)
1832            if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1833          }
1834        else break;
1835        }
1836      }
1837    
1838    /* If the next thing is itself optional, we have to give up. */
1839    
1840    if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
1841      return FALSE;
1842    
1843    /* Now compare the next item with the previous opcode. If the previous is a
1844    positive single character match, "item" either contains the character or, if
1845    "item" is greater than 127 in utf8 mode, the character's bytes are in
1846    utf8_char. */
1847    
1848    
1849    /* Handle cases when the next item is a character. */
1850    
1851    if (next >= 0) switch(op_code)
1852      {
1853      case OP_CHAR:
1854    #ifdef SUPPORT_UTF8
1855      if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1856    #endif
1857      return item != next;
1858    
1859      /* For CHARNC (caseless character) we must check the other case. If we have
1860      Unicode property support, we can use it to test the other case of
1861      high-valued characters. */
1862    
1863      case OP_CHARNC:
1864    #ifdef SUPPORT_UTF8
1865      if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1866    #endif
1867      if (item == next) return FALSE;
1868    #ifdef SUPPORT_UTF8
1869      if (utf8)
1870        {
1871        unsigned int othercase;
1872        if (next < 128) othercase = cd->fcc[next]; else
1873    #ifdef SUPPORT_UCP
1874        othercase = _pcre_ucp_othercase((unsigned int)next);
1875    #else
1876        othercase = NOTACHAR;
1877    #endif
1878        return (unsigned int)item != othercase;
1879        }
1880      else
1881    #endif  /* SUPPORT_UTF8 */
1882      return (item != cd->fcc[next]);  /* Non-UTF-8 mode */
1883    
1884      /* For OP_NOT, "item" must be a single-byte character. */
1885    
1886      case OP_NOT:
1887      if (next < 0) return FALSE;  /* Not a character */
1888      if (item == next) return TRUE;
1889      if ((options & PCRE_CASELESS) == 0) return FALSE;
1890    #ifdef SUPPORT_UTF8
1891      if (utf8)
1892        {
1893        unsigned int othercase;
1894        if (next < 128) othercase = cd->fcc[next]; else
1895    #ifdef SUPPORT_UCP
1896        othercase = _pcre_ucp_othercase(next);
1897    #else
1898        othercase = NOTACHAR;
1899    #endif
1900        return (unsigned int)item == othercase;
1901        }
1902      else
1903    #endif  /* SUPPORT_UTF8 */
1904      return (item == cd->fcc[next]);  /* Non-UTF-8 mode */
1905    
1906      case OP_DIGIT:
1907      return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
1908    
1909      case OP_NOT_DIGIT:
1910      return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
1911    
1912      case OP_WHITESPACE:
1913      return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
1914    
1915      case OP_NOT_WHITESPACE:
1916      return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
1917    
1918      case OP_WORDCHAR:
1919      return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
1920    
1921      case OP_NOT_WORDCHAR:
1922      return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
1923    
1924      default:
1925      return FALSE;
1926      }
1927    
1928    
1929    /* Handle the case when the next item is \d, \s, etc. */
1930    
1931    switch(op_code)
1932      {
1933      case OP_CHAR:
1934      case OP_CHARNC:
1935    #ifdef SUPPORT_UTF8
1936      if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1937    #endif
1938      switch(-next)
1939        {
1940        case ESC_d:
1941        return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
1942    
1943        case ESC_D:
1944        return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
1945    
1946        case ESC_s:
1947        return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
1948    
1949        case ESC_S:
1950        return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
1951    
1952        case ESC_w:
1953        return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
1954    
1955        case ESC_W:
1956        return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
1957    
1958        default:
1959        return FALSE;
1960        }
1961    
1962      case OP_DIGIT:
1963      return next == -ESC_D || next == -ESC_s || next == -ESC_W;
1964    
1965      case OP_NOT_DIGIT:
1966      return next == -ESC_d;
1967    
1968      case OP_WHITESPACE:
1969      return next == -ESC_S || next == -ESC_d || next == -ESC_w;
1970    
1971      case OP_NOT_WHITESPACE:
1972      return next == -ESC_s;
1973    
1974      case OP_WORDCHAR:
1975      return next == -ESC_W || next == -ESC_s;
1976    
1977      case OP_NOT_WORDCHAR:
1978      return next == -ESC_w || next == -ESC_d;
1979    
1980      default:
1981      return FALSE;
1982      }
1983    
1984    /* Control does not reach here */
1985    }
1986    
1987    
1988    
1989  /*************************************************  /*************************************************
1990  *           Compile one branch                   *  *           Compile one branch                   *
1991  *************************************************/  *************************************************/
1992    
1993  /* Scan the pattern, compiling it into the code vector. If the options are  /* Scan the pattern, compiling it into the code vector. If the options are
1994  changed during the branch, the pointer is used to change the external options  changed during the branch, the pointer is used to change the external options
1995  bits.  bits. This function is used during the pre-compile phase when we are trying
1996    to find out the amount of memory needed, as well as during the real compile
1997    phase. The value of lengthptr distinguishes the two phases.
1998    
1999  Arguments:  Arguments:
2000    optionsptr     pointer to the option bits    optionsptr     pointer to the option bits
   brackets       points to number of extracting brackets used  
2001    codeptr        points to the pointer to the current code point    codeptr        points to the pointer to the current code point
2002    ptrptr         points to the current pattern pointer    ptrptr         points to the current pattern pointer
2003    errorcodeptr   points to error code variable    errorcodeptr   points to error code variable
# Line 1594  Arguments: Line 2005  Arguments:
2005    reqbyteptr     set to the last literal character required, else < 0    reqbyteptr     set to the last literal character required, else < 0
2006    bcptr          points to current branch chain    bcptr          points to current branch chain
2007    cd             contains pointers to tables etc.    cd             contains pointers to tables etc.
2008      lengthptr      NULL during the real compile phase
2009                     points to length accumulator during pre-compile phase
2010    
2011  Returns:         TRUE on success  Returns:         TRUE on success
2012                   FALSE, with *errorcodeptr set non-zero on error                   FALSE, with *errorcodeptr set non-zero on error
2013  */  */
2014    
2015  static BOOL  static BOOL
2016  compile_branch(int *optionsptr, int *brackets, uschar **codeptr,  compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2017    const uschar **ptrptr, int *errorcodeptr, int *firstbyteptr,    int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2018    int *reqbyteptr, branch_chain *bcptr, compile_data *cd)    compile_data *cd, int *lengthptr)
2019  {  {
2020  int repeat_type, op_type;  int repeat_type, op_type;
2021  int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */  int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
# Line 1613  int zeroreqbyte, zerofirstbyte; Line 2026  int zeroreqbyte, zerofirstbyte;
2026  int req_caseopt, reqvary, tempreqvary;  int req_caseopt, reqvary, tempreqvary;
2027  int options = *optionsptr;  int options = *optionsptr;
2028  int after_manual_callout = 0;  int after_manual_callout = 0;
2029    int length_prevgroup = 0;
2030  register int c;  register int c;
2031  register uschar *code = *codeptr;  register uschar *code = *codeptr;
2032    uschar *last_code = code;
2033    uschar *orig_code = code;
2034  uschar *tempcode;  uschar *tempcode;
2035  BOOL inescq = FALSE;  BOOL inescq = FALSE;
2036  BOOL groupsetfirstbyte = FALSE;  BOOL groupsetfirstbyte = FALSE;
# Line 1622  const uschar *ptr = *ptrptr; Line 2038  const uschar *ptr = *ptrptr;
2038  const uschar *tempptr;  const uschar *tempptr;
2039  uschar *previous = NULL;  uschar *previous = NULL;
2040  uschar *previous_callout = NULL;  uschar *previous_callout = NULL;
2041    uschar *save_hwm = NULL;
2042  uschar classbits[32];  uschar classbits[32];
2043    
2044  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
# Line 1631  uschar *class_utf8data; Line 2048  uschar *class_utf8data;
2048  uschar utf8_char[6];  uschar utf8_char[6];
2049  #else  #else
2050  BOOL utf8 = FALSE;  BOOL utf8 = FALSE;
2051    uschar *utf8_char = NULL;
2052    #endif
2053    
2054    #ifdef DEBUG
2055    if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2056  #endif  #endif
2057    
2058  /* Set up the default and non-default settings for greediness */  /* Set up the default and non-default settings for greediness */
# Line 1664  for (;; ptr++) Line 2086  for (;; ptr++)
2086    BOOL negate_class;    BOOL negate_class;
2087    BOOL possessive_quantifier;    BOOL possessive_quantifier;
2088    BOOL is_quantifier;    BOOL is_quantifier;
2089      BOOL is_recurse;
2090    int class_charcount;    int class_charcount;
2091    int class_lastchar;    int class_lastchar;
2092    int newoptions;    int newoptions;
# Line 1671  for (;; ptr++) Line 2094  for (;; ptr++)
2094    int skipbytes;    int skipbytes;
2095    int subreqbyte;    int subreqbyte;
2096    int subfirstbyte;    int subfirstbyte;
2097      int terminator;
2098    int mclength;    int mclength;
2099    uschar mcbuffer[8];    uschar mcbuffer[8];
2100    
2101    /* Next byte in the pattern */    /* Get next byte in the pattern */
2102    
2103    c = *ptr;    c = *ptr;
2104    
2105      /* If we are in the pre-compile phase, accumulate the length used for the
2106      previous cycle of this loop. */
2107    
2108      if (lengthptr != NULL)
2109        {
2110    #ifdef DEBUG
2111        if (code > cd->hwm) cd->hwm = code;                 /* High water info */
2112    #endif
2113        if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2114          {
2115          *errorcodeptr = ERR52;
2116          goto FAILED;
2117          }
2118    
2119        /* There is at least one situation where code goes backwards: this is the
2120        case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2121        the class is simply eliminated. However, it is created first, so we have to
2122        allow memory for it. Therefore, don't ever reduce the length at this point.
2123        */
2124    
2125        if (code < last_code) code = last_code;
2126        *lengthptr += code - last_code;
2127        DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2128    
2129        /* If "previous" is set and it is not at the start of the work space, move
2130        it back to there, in order to avoid filling up the work space. Otherwise,
2131        if "previous" is NULL, reset the current code pointer to the start. */
2132    
2133        if (previous != NULL)
2134          {
2135          if (previous > orig_code)
2136            {
2137            memmove(orig_code, previous, code - previous);
2138            code -= previous - orig_code;
2139            previous = orig_code;
2140            }
2141          }
2142        else code = orig_code;
2143    
2144        /* Remember where this code item starts so we can pick up the length
2145        next time round. */
2146    
2147        last_code = code;
2148        }
2149    
2150      /* In the real compile phase, just check the workspace used by the forward
2151      reference list. */
2152    
2153      else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2154        {
2155        *errorcodeptr = ERR52;
2156        goto FAILED;
2157        }
2158    
2159    /* If in \Q...\E, check for the end; if not, we have a literal */    /* If in \Q...\E, check for the end; if not, we have a literal */
2160    
2161    if (inescq && c != 0)    if (inescq && c != 0)
# Line 1692  for (;; ptr++) Line 2170  for (;; ptr++)
2170        {        {
2171        if (previous_callout != NULL)        if (previous_callout != NULL)
2172          {          {
2173          complete_callout(previous_callout, ptr, cd);          if (lengthptr == NULL)  /* Don't attempt in pre-compile phase */
2174              complete_callout(previous_callout, ptr, cd);
2175          previous_callout = NULL;          previous_callout = NULL;
2176          }          }
2177        if ((options & PCRE_AUTO_CALLOUT) != 0)        if ((options & PCRE_AUTO_CALLOUT) != 0)
# Line 1713  for (;; ptr++) Line 2192  for (;; ptr++)
2192    if (!is_quantifier && previous_callout != NULL &&    if (!is_quantifier && previous_callout != NULL &&
2193         after_manual_callout-- <= 0)         after_manual_callout-- <= 0)
2194      {      {
2195      complete_callout(previous_callout, ptr, cd);      if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
2196          complete_callout(previous_callout, ptr, cd);
2197      previous_callout = NULL;      previous_callout = NULL;
2198      }      }
2199    
# Line 1724  for (;; ptr++) Line 2204  for (;; ptr++)
2204      if ((cd->ctypes[c] & ctype_space) != 0) continue;      if ((cd->ctypes[c] & ctype_space) != 0) continue;
2205      if (c == '#')      if (c == '#')
2206        {        {
2207        while (*(++ptr) != 0) if (IS_NEWLINE(ptr)) break;        while (*(++ptr) != 0)
       if (*ptr != 0)  
2208          {          {
2209          ptr += cd->nllen - 1;          if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
         continue;  
2210          }          }
2211          if (*ptr != 0) continue;
2212    
2213        /* Else fall through to handle end of string */        /* Else fall through to handle end of string */
2214        c = 0;        c = 0;
2215        }        }
# Line 1745  for (;; ptr++) Line 2225  for (;; ptr++)
2225    
2226    switch(c)    switch(c)
2227      {      {
2228      /* The branch terminates at end of string, |, or ). */      /* ===================================================================*/
2229        case 0:                        /* The branch terminates at string end */
2230      case 0:      case '|':                      /* or | or ) */
     case '|':  
2231      case ')':      case ')':
2232      *firstbyteptr = firstbyte;      *firstbyteptr = firstbyte;
2233      *reqbyteptr = reqbyte;      *reqbyteptr = reqbyte;
2234      *codeptr = code;      *codeptr = code;
2235      *ptrptr = ptr;      *ptrptr = ptr;
2236        if (lengthptr != NULL)
2237          {
2238          *lengthptr += code - last_code;   /* To include callout length */
2239          DPRINTF((">> end branch\n"));
2240          }
2241      return TRUE;      return TRUE;
2242    
2243    
2244        /* ===================================================================*/
2245      /* Handle single-character metacharacters. In multiline mode, ^ disables      /* Handle single-character metacharacters. In multiline mode, ^ disables
2246      the setting of any following char as a first character. */      the setting of any following char as a first character. */
2247    
# Line 1784  for (;; ptr++) Line 2270  for (;; ptr++)
2270      *code++ = OP_ANY;      *code++ = OP_ANY;
2271      break;      break;
2272    
2273    
2274        /* ===================================================================*/
2275      /* Character classes. If the included characters are all < 256, we build a      /* Character classes. If the included characters are all < 256, we build a
2276      32-byte bitmap of the permitted characters, except in the special case      32-byte bitmap of the permitted characters, except in the special case
2277      where there is only one such character. For negated classes, we build the      where there is only one such character. For negated classes, we build the
# Line 1822  for (;; ptr++) Line 2310  for (;; ptr++)
2310        }        }
2311    
2312      /* Keep a count of chars with values < 256 so that we can optimize the case      /* Keep a count of chars with values < 256 so that we can optimize the case
2313      of just a single character (as long as it's < 256). For higher valued UTF-8      of just a single character (as long as it's < 256). However, for higher
2314      characters, we don't yet do any optimization. */      valued UTF-8 characters, we don't yet do any optimization. */
2315    
2316      class_charcount = 0;      class_charcount = 0;
2317      class_lastchar = -1;      class_lastchar = -1;
2318    
2319        /* Initialize the 32-char bit map to all zeros. We build the map in a
2320        temporary bit of memory, in case the class contains only 1 character (less
2321        than 256), because in that case the compiled code doesn't use the bit map.
2322        */
2323    
2324        memset(classbits, 0, 32 * sizeof(uschar));
2325    
2326  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2327      class_utf8 = FALSE;                       /* No chars >= 256 */      class_utf8 = FALSE;                       /* No chars >= 256 */
2328      class_utf8data = code + LINK_SIZE + 34;   /* For UTF-8 items */      class_utf8data = code + LINK_SIZE + 2;    /* For UTF-8 items */
2329  #endif  #endif
2330    
     /* Initialize the 32-char bit map to all zeros. We have to build the  
     map in a temporary bit of store, in case the class contains only 1  
     character (< 256), because in that case the compiled code doesn't use the  
     bit map. */  
   
     memset(classbits, 0, 32 * sizeof(uschar));  
   
2331      /* Process characters until ] is reached. By writing this as a "do" it      /* Process characters until ] is reached. By writing this as a "do" it
2332      means that an initial ] is taken as a data character. The first pass      means that an initial ] is taken as a data character. At the start of the
2333      through the regex checked the overall syntax, so we don't need to be very      loop, c contains the first byte of the character. */
     strict here. At the start of the loop, c contains the first byte of the  
     character. */  
2334    
2335      do      if (c != 0) do
2336        {        {
2337          const uschar *oldptr;
2338    
2339  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2340        if (utf8 && c > 127)        if (utf8 && c > 127)
2341          {                           /* Braces are required because the */          {                           /* Braces are required because the */
# Line 1859  for (;; ptr++) Line 2347  for (;; ptr++)
2347    
2348        if (inescq)        if (inescq)
2349          {          {
2350          if (c == '\\' && ptr[1] == 'E')          if (c == '\\' && ptr[1] == 'E')     /* If we are at \E */
2351            {            {
2352            inescq = FALSE;            inescq = FALSE;                   /* Reset literal state */
2353            ptr++;            ptr++;                            /* Skip the 'E' */
2354            continue;            continue;                         /* Carry on with next */
2355            }            }
2356          else goto LONE_SINGLE_CHARACTER;          goto CHECK_RANGE;                   /* Could be range if \E follows */
2357          }          }
2358    
2359        /* Handle POSIX class names. Perl allows a negation extension of the        /* Handle POSIX class names. Perl allows a negation extension of the
# Line 1956  for (;; ptr++) Line 2444  for (;; ptr++)
2444          }          }
2445    
2446        /* Backslash may introduce a single character, or it may introduce one        /* Backslash may introduce a single character, or it may introduce one
2447        of the specials, which just set a flag. Escaped items are checked for        of the specials, which just set a flag. The sequence \b is a special
2448        validity in the pre-compiling pass. The sequence \b is a special case.        case. Inside a class (and only there) it is treated as backspace.
2449        Inside a class (and only there) it is treated as backspace. Elsewhere        Elsewhere it marks a word boundary. Other escapes have preset maps ready
2450        it marks a word boundary. Other escapes have preset maps ready to        to or into the one we are building. We assume they have more than one
       or into the one we are building. We assume they have more than one  
2451        character in them, so set class_charcount bigger than one. */        character in them, so set class_charcount bigger than one. */
2452    
2453        if (c == '\\')        if (c == '\\')
2454          {          {
2455          c = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);          c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2456            if (*errorcodeptr != 0) goto FAILED;
2457    
2458          if (-c == ESC_b) c = '\b';       /* \b is backspace in a class */          if (-c == ESC_b) c = '\b';       /* \b is backspace in a class */
2459          else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */          else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */
2460            else if (-c == ESC_R) c = 'R';   /* \R is literal R in a class */
2461          else if (-c == ESC_Q)            /* Handle start of quoted string */          else if (-c == ESC_Q)            /* Handle start of quoted string */
2462            {            {
2463            if (ptr[1] == '\\' && ptr[2] == 'E')            if (ptr[1] == '\\' && ptr[2] == 'E')
# Line 1983  for (;; ptr++) Line 2472  for (;; ptr++)
2472            {            {
2473            register const uschar *cbits = cd->cbits;            register const uschar *cbits = cd->cbits;
2474            class_charcount += 2;     /* Greater than 1 is what matters */            class_charcount += 2;     /* Greater than 1 is what matters */
2475            switch (-c)  
2476              /* Save time by not doing this in the pre-compile phase. */
2477    
2478              if (lengthptr == NULL) switch (-c)
2479              {              {
2480              case ESC_d:              case ESC_d:
2481              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
# Line 2011  for (;; ptr++) Line 2503  for (;; ptr++)
2503              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
2504              continue;              continue;
2505    
2506  #ifdef SUPPORT_UCP              case ESC_E: /* Perl ignores an orphan \E */
             case ESC_p:  
             case ESC_P:  
               {  
               BOOL negated;  
               int pdata;  
               int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);  
               if (ptype < 0) goto FAILED;  
               class_utf8 = TRUE;  
               *class_utf8data++ = ((-c == ESC_p) != negated)?  
                 XCL_PROP : XCL_NOTPROP;  
               *class_utf8data++ = ptype;  
               *class_utf8data++ = pdata;  
               class_charcount -= 2;   /* Not a < 256 character */  
               }  
2507              continue;              continue;
 #endif  
   
             /* Unrecognized escapes are faulted if PCRE is running in its  
             strict mode. By default, for compatibility with Perl, they are  
             treated as literals. */  
2508    
2509              default:              default:    /* Not recognized; fall through */
2510              if ((options & PCRE_EXTRA) != 0)              break;      /* Need "default" setting to stop compiler warning. */
               {  
               *errorcodeptr = ERR7;  
               goto FAILED;  
               }  
             c = *ptr;              /* The final character */  
             class_charcount -= 2;  /* Undo the default count from above */  
2511              }              }
           }  
2512    
2513          /* Fall through if we have a single character (c >= 0). This may be            /* In the pre-compile phase, just do the recognition. */
         > 256 in UTF-8 mode. */  
2514    
2515          }   /* End of backslash handling */            else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2516                       c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2517    
2518              /* We need to deal with \P and \p in both phases. */
2519    
2520    #ifdef SUPPORT_UCP
2521              if (-c == ESC_p || -c == ESC_P)
2522                {
2523                BOOL negated;
2524                int pdata;
2525                int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2526                if (ptype < 0) goto FAILED;
2527                class_utf8 = TRUE;
2528                *class_utf8data++ = ((-c == ESC_p) != negated)?
2529                  XCL_PROP : XCL_NOTPROP;
2530                *class_utf8data++ = ptype;
2531                *class_utf8data++ = pdata;
2532                class_charcount -= 2;   /* Not a < 256 character */
2533                continue;
2534                }
2535    #endif
2536              /* Unrecognized escapes are faulted if PCRE is running in its
2537              strict mode. By default, for compatibility with Perl, they are
2538              treated as literals. */
2539    
2540              if ((options & PCRE_EXTRA) != 0)
2541                {
2542                *errorcodeptr = ERR7;
2543                goto FAILED;
2544                }
2545    
2546              class_charcount -= 2;  /* Undo the default count from above */
2547              c = *ptr;              /* Get the final character and fall through */
2548              }
2549    
2550            /* Fall through if we have a single character (c >= 0). This may be
2551            greater than 256 in UTF-8 mode. */
2552    
2553            }   /* End of backslash handling */
2554    
2555        /* A single character may be followed by '-' to form a range. However,        /* A single character may be followed by '-' to form a range. However,
2556        Perl does not permit ']' to be the end of the range. A '-' character        Perl does not permit ']' to be the end of the range. A '-' character
2557        here is treated as a literal. */        at the end is treated as a literal. Perl ignores orphaned \E sequences
2558          entirely. The code for handling \Q and \E is messy. */
2559    
2560          CHECK_RANGE:
2561          while (ptr[1] == '\\' && ptr[2] == 'E')
2562            {
2563            inescq = FALSE;
2564            ptr += 2;
2565            }
2566    
2567          oldptr = ptr;
2568    
2569        if (ptr[1] == '-' && ptr[2] != ']')        if (!inescq && ptr[1] == '-')
2570          {          {
2571          int d;          int d;
2572          ptr += 2;          ptr += 2;
2573            while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
2574    
2575            /* If we hit \Q (not followed by \E) at this point, go into escaped
2576            mode. */
2577    
2578            while (*ptr == '\\' && ptr[1] == 'Q')
2579              {
2580              ptr += 2;
2581              if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
2582              inescq = TRUE;
2583              break;
2584              }
2585    
2586            if (*ptr == 0 || (!inescq && *ptr == ']'))
2587              {
2588              ptr = oldptr;
2589              goto LONE_SINGLE_CHARACTER;
2590              }
2591    
2592  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2593          if (utf8)          if (utf8)
# Line 2071  for (;; ptr++) Line 2602  for (;; ptr++)
2602          not any of the other escapes. Perl 5.6 treats a hyphen as a literal          not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2603          in such circumstances. */          in such circumstances. */
2604    
2605          if (d == '\\')          if (!inescq && d == '\\')
2606            {            {
2607            const uschar *oldptr = ptr;            d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2608            d = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);            if (*errorcodeptr != 0) goto FAILED;
2609    
2610            /* \b is backslash; \X is literal X; any other special means the '-'            /* \b is backslash; \X is literal X; \R is literal R; any other
2611            was literal */            special means the '-' was literal */
2612    
2613            if (d < 0)            if (d < 0)
2614              {              {
2615              if (d == -ESC_b) d = '\b';              if (d == -ESC_b) d = '\b';
2616              else if (d == -ESC_X) d = 'X'; else              else if (d == -ESC_X) d = 'X';
2617                else if (d == -ESC_R) d = 'R'; else
2618                {                {
2619                ptr = oldptr - 2;                ptr = oldptr;
2620                goto LONE_SINGLE_CHARACTER;  /* A few lines below */                goto LONE_SINGLE_CHARACTER;  /* A few lines below */
2621                }                }
2622              }              }
2623            }            }
2624    
2625          /* The check that the two values are in the correct order happens in          /* Check that the two values are in the correct order. Optimize
2626          the pre-pass. Optimize one-character ranges */          one-character ranges */
2627    
2628            if (d < c)
2629              {
2630              *errorcodeptr = ERR8;
2631              goto FAILED;
2632              }
2633    
2634          if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */          if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */
2635    
# Line 2112  for (;; ptr++) Line 2650  for (;; ptr++)
2650  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2651            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
2652              {              {
2653              int occ, ocd;              unsigned int occ, ocd;
2654              int cc = c;              unsigned int cc = c;
2655              int origd = d;              unsigned int origd = d;
2656              while (get_othercase_range(&cc, origd, &occ, &ocd))              while (get_othercase_range(&cc, origd, &occ, &ocd))
2657                {                {
2658                if (occ >= c && ocd <= d) continue;  /* Skip embedded ranges */                if (occ >= c && ocd <= d) continue;  /* Skip embedded ranges */
# Line 2172  for (;; ptr++) Line 2710  for (;; ptr++)
2710          ranges that lie entirely within 0-127 when there is UCP support; else          ranges that lie entirely within 0-127 when there is UCP support; else
2711          for partial ranges without UCP support. */          for partial ranges without UCP support. */
2712    
2713          for (; c <= d; c++)          class_charcount += d - c + 1;
2714            class_lastchar = d;
2715    
2716            /* We can save a bit of time by skipping this in the pre-compile. */
2717    
2718            if (lengthptr == NULL) for (; c <= d; c++)
2719            {            {
2720            classbits[c/8] |= (1 << (c&7));            classbits[c/8] |= (1 << (c&7));
2721            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
# Line 2180  for (;; ptr++) Line 2723  for (;; ptr++)
2723              int uc = cd->fcc[c];           /* flip case */              int uc = cd->fcc[c];           /* flip case */
2724              classbits[uc/8] |= (1 << (uc&7));              classbits[uc/8] |= (1 << (uc&7));
2725              }              }
           class_charcount++;                /* in case a one-char range */  
           class_lastchar = c;  
2726            }            }
2727    
2728          continue;   /* Go get the next char in the class */          continue;   /* Go get the next char in the class */
# Line 2205  for (;; ptr++) Line 2746  for (;; ptr++)
2746  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2747          if ((options & PCRE_CASELESS) != 0)          if ((options & PCRE_CASELESS) != 0)
2748            {            {
2749            int othercase;            unsigned int othercase;
2750            if ((othercase = _pcre_ucp_othercase(c)) >= 0)            if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
2751              {              {
2752              *class_utf8data++ = XCL_SINGLE;              *class_utf8data++ = XCL_SINGLE;
2753              class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);              class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
# Line 2231  for (;; ptr++) Line 2772  for (;; ptr++)
2772          }          }
2773        }        }
2774    
2775      /* Loop until ']' reached; the check for end of string happens inside the      /* Loop until ']' reached. This "while" is the end of the "do" above. */
     loop. This "while" is the end of the "do" above. */  
2776    
2777      while ((c = *(++ptr)) != ']' || inescq);      while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
2778    
2779        if (c == 0)                          /* Missing terminating ']' */
2780          {
2781          *errorcodeptr = ERR6;
2782          goto FAILED;
2783          }
2784    
2785      /* If class_charcount is 1, we saw precisely one character whose value is      /* If class_charcount is 1, we saw precisely one character whose value is
2786      less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we      less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
# Line 2298  for (;; ptr++) Line 2844  for (;; ptr++)
2844    
2845      /* If there are characters with values > 255, we have to compile an      /* If there are characters with values > 255, we have to compile an
2846      extended class, with its own opcode. If there are no characters < 256,      extended class, with its own opcode. If there are no characters < 256,
2847      we can omit the bitmap. */      we can omit the bitmap in the actual compiled code. */
2848    
2849  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2850      if (class_utf8)      if (class_utf8)
# Line 2308  for (;; ptr++) Line 2854  for (;; ptr++)
2854        code += LINK_SIZE;        code += LINK_SIZE;
2855        *code = negate_class? XCL_NOT : 0;        *code = negate_class? XCL_NOT : 0;
2856    
2857        /* If the map is required, install it, and move on to the end of        /* If the map is required, move up the extra data to make room for it;
2858        the extra data */        otherwise just move the code pointer to the end of the extra data. */
2859    
2860        if (class_charcount > 0)        if (class_charcount > 0)
2861          {          {
2862          *code++ |= XCL_MAP;          *code++ |= XCL_MAP;
2863            memmove(code + 32, code, class_utf8data - code);
2864          memcpy(code, classbits, 32);          memcpy(code, classbits, 32);
2865          code = class_utf8data;          code = class_utf8data + 32;
         }  
   
       /* If the map is not required, slide down the extra data. */  
   
       else  
         {  
         int len = class_utf8data - (code + 33);  
         memmove(code + 1, code + 33, len);  
         code += len + 1;  
2866          }          }
2867          else code = class_utf8data;
2868    
2869        /* Now fill in the complete length of the item */        /* Now fill in the complete length of the item */
2870    
# Line 2342  for (;; ptr++) Line 2881  for (;; ptr++)
2881      if (negate_class)      if (negate_class)
2882        {        {
2883        *code++ = OP_NCLASS;        *code++ = OP_NCLASS;
2884        for (c = 0; c < 32; c++) code[c] = ~classbits[c];        if (lengthptr == NULL)    /* Save time in the pre-compile phase */
2885            for (c = 0; c < 32; c++) code[c] = ~classbits[c];
2886        }        }
2887      else      else
2888        {        {
# Line 2352  for (;; ptr++) Line 2892  for (;; ptr++)
2892      code += 32;      code += 32;
2893      break;      break;
2894    
2895    
2896        /* ===================================================================*/
2897      /* Various kinds of repeat; '{' is not necessarily a quantifier, but this      /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
2898      has been tested above. */      has been tested above. */
2899    
# Line 2419  for (;; ptr++) Line 2961  for (;; ptr++)
2961        }        }
2962      else repeat_type = greedy_default;      else repeat_type = greedy_default;
2963    
     /* If previous was a recursion, we need to wrap it inside brackets so that  
     it can be replicated if necessary. */  
   
     if (*previous == OP_RECURSE)  
       {  
       memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);  
       code += 1 + LINK_SIZE;  
       *previous = OP_BRA;  
       PUT(previous, 1, code - previous);  
       *code = OP_KET;  
       PUT(code, 1, code - previous);  
       code += 1 + LINK_SIZE;  
       }  
   
2964      /* If previous was a character match, abolish the item and generate a      /* If previous was a character match, abolish the item and generate a
2965      repeat item instead. If a char item has a minimum of more than one, ensure      repeat item instead. If a char item has a minimum of more than one, ensure
2966      that it is set in reqbyte - it might not be if a sequence such as x{3} is      that it is set in reqbyte - it might not be if a sequence such as x{3} is
# Line 2466  for (;; ptr++) Line 2994  for (;; ptr++)
2994          if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;          if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
2995          }          }
2996    
2997          /* If the repetition is unlimited, it pays to see if the next thing on
2998          the line is something that cannot possibly match this character. If so,
2999          automatically possessifying this item gains some performance in the case
3000          where the match fails. */
3001    
3002          if (!possessive_quantifier &&
3003              repeat_max < 0 &&
3004              check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3005                options, cd))
3006            {
3007            repeat_type = 0;    /* Force greedy */
3008            possessive_quantifier = TRUE;
3009            }
3010    
3011        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
3012        }        }
3013    
3014      /* If previous was a single negated character ([^a] or similar), we use      /* If previous was a single negated character ([^a] or similar), we use
3015      one of the special opcodes, replacing it. The code is shared with single-      one of the special opcodes, replacing it. The code is shared with single-
3016      character repeats by setting opt_type to add a suitable offset into      character repeats by setting opt_type to add a suitable offset into
3017      repeat_type. OP_NOT is currently used only for single-byte chars. */      repeat_type. We can also test for auto-possessification. OP_NOT is
3018        currently used only for single-byte chars. */
3019    
3020      else if (*previous == OP_NOT)      else if (*previous == OP_NOT)
3021        {        {
3022        op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */        op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */
3023        c = previous[1];        c = previous[1];
3024          if (!possessive_quantifier &&
3025              repeat_max < 0 &&
3026              check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3027            {
3028            repeat_type = 0;    /* Force greedy */
3029            possessive_quantifier = TRUE;
3030            }
3031        goto OUTPUT_SINGLE_REPEAT;        goto OUTPUT_SINGLE_REPEAT;
3032        }        }
3033    
# Line 2495  for (;; ptr++) Line 3045  for (;; ptr++)
3045        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
3046        c = *previous;        c = *previous;
3047    
3048          if (!possessive_quantifier &&
3049              repeat_max < 0 &&
3050              check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3051            {
3052            repeat_type = 0;    /* Force greedy */
3053            possessive_quantifier = TRUE;
3054            }
3055    
3056        OUTPUT_SINGLE_REPEAT:        OUTPUT_SINGLE_REPEAT:
3057        if (*previous == OP_PROP || *previous == OP_NOTPROP)        if (*previous == OP_PROP || *previous == OP_NOTPROP)
3058          {          {
# Line 2535  for (;; ptr++) Line 3093  for (;; ptr++)
3093          }          }
3094    
3095        /* A repeat minimum of 1 is optimized into some special cases. If the        /* A repeat minimum of 1 is optimized into some special cases. If the
3096        maximum is unlimited, we use OP_PLUS. Otherwise, the original item it        maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3097        left in place and, if the maximum is greater than 1, we use OP_UPTO with        left in place and, if the maximum is greater than 1, we use OP_UPTO with
3098        one less than the maximum. */        one less than the maximum. */
3099    
# Line 2588  for (;; ptr++) Line 3146  for (;; ptr++)
3146            }            }
3147    
3148          /* Else insert an UPTO if the max is greater than the min, again          /* Else insert an UPTO if the max is greater than the min, again
3149          preceded by the character, for the previously inserted code. */          preceded by the character, for the previously inserted code. If the
3150            UPTO is just for 1 instance, we can use QUERY instead. */
3151    
3152          else if (repeat_max != repeat_min)          else if (repeat_max != repeat_min)
3153            {            {
# Line 2607  for (;; ptr++) Line 3166  for (;; ptr++)
3166              *code++ = prop_value;              *code++ = prop_value;
3167              }              }
3168            repeat_max -= repeat_min;            repeat_max -= repeat_min;
3169            *code++ = OP_UPTO + repeat_type;  
3170            PUT2INC(code, 0, repeat_max);            if (repeat_max == 1)
3171                {
3172                *code++ = OP_QUERY + repeat_type;
3173                }
3174              else
3175                {
3176                *code++ = OP_UPTO + repeat_type;
3177                PUT2INC(code, 0, repeat_max);
3178                }
3179            }            }
3180          }          }
3181    
# Line 2675  for (;; ptr++) Line 3242  for (;; ptr++)
3242      /* If previous was a bracket group, we may have to replicate it in certain      /* If previous was a bracket group, we may have to replicate it in certain
3243      cases. */      cases. */
3244    
3245      else if (*previous >= OP_BRA || *previous == OP_ONCE ||      else if (*previous == OP_BRA  || *previous == OP_CBRA ||
3246               *previous == OP_COND)               *previous == OP_ONCE || *previous == OP_COND)
3247        {        {
3248        register int i;        register int i;
3249        int ketoffset = 0;        int ketoffset = 0;
3250        int len = code - previous;        int len = code - previous;
3251        uschar *bralink = NULL;        uschar *bralink = NULL;
3252    
3253          /* Repeating a DEFINE group is pointless */
3254    
3255          if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3256            {
3257            *errorcodeptr = ERR55;
3258            goto FAILED;
3259            }
3260    
3261          /* This is a paranoid check to stop integer overflow later on */
3262    
3263          if (len > MAX_DUPLENGTH)
3264            {
3265            *errorcodeptr = ERR50;
3266            goto FAILED;
3267            }
3268    
3269        /* If the maximum repeat count is unlimited, find the end of the bracket        /* If the maximum repeat count is unlimited, find the end of the bracket
3270        by scanning through from the start, and compute the offset back to it        by scanning through from the start, and compute the offset back to it
3271        from the current code pointer. There may be an OP_OPT setting following        from the current code pointer. There may be an OP_OPT setting following
# Line 2717  for (;; ptr++) Line 3300  for (;; ptr++)
3300          /* If the maximum is 1 or unlimited, we just have to stick in the          /* If the maximum is 1 or unlimited, we just have to stick in the
3301          BRAZERO and do no more at this point. However, we do need to adjust          BRAZERO and do no more at this point. However, we do need to adjust
3302          any OP_RECURSE calls inside the group that refer to the group itself or          any OP_RECURSE calls inside the group that refer to the group itself or
3303          any internal group, because the offset is from the start of the whole          any internal or forward referenced group, because the offset is from
3304          regex. Temporarily terminate the pattern while doing this. */          the start of the whole regex. Temporarily terminate the pattern while
3305            doing this. */
3306    
3307          if (repeat_max <= 1)          if (repeat_max <= 1)
3308            {            {
3309            *code = OP_END;            *code = OP_END;
3310            adjust_recurse(previous, 1, utf8, cd);            adjust_recurse(previous, 1, utf8, cd, save_hwm);
3311            memmove(previous+1, previous, len);            memmove(previous+1, previous, len);
3312            code++;            code++;
3313            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
# Line 2741  for (;; ptr++) Line 3325  for (;; ptr++)
3325            {            {
3326            int offset;            int offset;
3327            *code = OP_END;            *code = OP_END;
3328            adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd);            adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3329            memmove(previous + 2 + LINK_SIZE, previous, len);            memmove(previous + 2 + LINK_SIZE, previous, len);
3330            code += 2 + LINK_SIZE;            code += 2 + LINK_SIZE;
3331            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
# Line 2761  for (;; ptr++) Line 3345  for (;; ptr++)
3345        /* If the minimum is greater than zero, replicate the group as many        /* If the minimum is greater than zero, replicate the group as many
3346        times as necessary, and adjust the maximum to the number of subsequent        times as necessary, and adjust the maximum to the number of subsequent
3347        copies that we need. If we set a first char from the group, and didn't        copies that we need. If we set a first char from the group, and didn't
3348        set a required char, copy the latter from the former. */        set a required char, copy the latter from the former. If there are any
3349          forward reference subroutine calls in the group, there will be entries on
3350          the workspace list; replicate these with an appropriate increment. */
3351    
3352        else        else
3353          {          {
3354          if (repeat_min > 1)          if (repeat_min > 1)
3355            {            {
3356            if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;            /* In the pre-compile phase, we don't actually do the replication. We
3357            for (i = 1; i < repeat_min; i++)            just adjust the length as if we had. */
3358    
3359              if (lengthptr != NULL)
3360                *lengthptr += (repeat_min - 1)*length_prevgroup;
3361    
3362              /* This is compiling for real */
3363    
3364              else
3365              {              {
3366              memcpy(code, previous, len);              if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3367              code += len;              for (i = 1; i < repeat_min; i++)
3368                  {
3369                  uschar *hc;
3370                  uschar *this_hwm = cd->hwm;
3371                  memcpy(code, previous, len);
3372                  for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3373                    {
3374                    PUT(cd->hwm, 0, GET(hc, 0) + len);
3375                    cd->hwm += LINK_SIZE;
3376                    }
3377                  save_hwm = this_hwm;
3378                  code += len;
3379                  }
3380              }              }
3381            }            }
3382    
3383          if (repeat_max > 0) repeat_max -= repeat_min;          if (repeat_max > 0) repeat_max -= repeat_min;
3384          }          }
3385    
# Line 2781  for (;; ptr++) Line 3387  for (;; ptr++)
3387        the maximum is limited, it replicates the group in a nested fashion,        the maximum is limited, it replicates the group in a nested fashion,
3388        remembering the bracket starts on a stack. In the case of a zero minimum,        remembering the bracket starts on a stack. In the case of a zero minimum,
3389        the first one was set up above. In all cases the repeat_max now specifies        the first one was set up above. In all cases the repeat_max now specifies
3390        the number of additional copies needed. */        the number of additional copies needed. Again, we must remember to
3391          replicate entries on the forward reference list. */
3392    
3393        if (repeat_max >= 0)        if (repeat_max >= 0)
3394          {          {
3395          for (i = repeat_max - 1; i >= 0; i--)          /* In the pre-compile phase, we don't actually do the replication. We
3396            just adjust the length as if we had. For each repetition we must add 1
3397            to the length for BRAZERO and for all but the last repetition we must
3398            add 2 + 2*LINK_SIZE to allow for the nesting that occurs. */
3399    
3400            if (lengthptr != NULL && repeat_max > 0)
3401              *lengthptr += repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3402                2 - 2*LINK_SIZE;  /* Last one doesn't nest */
3403    
3404            /* This is compiling for real */
3405    
3406            else for (i = repeat_max - 1; i >= 0; i--)
3407            {            {
3408              uschar *hc;
3409              uschar *this_hwm = cd->hwm;
3410    
3411            *code++ = OP_BRAZERO + repeat_type;            *code++ = OP_BRAZERO + repeat_type;
3412    
3413            /* All but the final copy start a new nesting, maintaining the            /* All but the final copy start a new nesting, maintaining the
# Line 2802  for (;; ptr++) Line 3423  for (;; ptr++)
3423              }              }
3424    
3425            memcpy(code, previous, len);            memcpy(code, previous, len);
3426              for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3427                {
3428                PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3429                cd->hwm += LINK_SIZE;
3430                }
3431              save_hwm = this_hwm;
3432            code += len;            code += len;
3433            }            }
3434    
# Line 2824  for (;; ptr++) Line 3451  for (;; ptr++)
3451        /* If the maximum is unlimited, set a repeater in the final copy. We        /* If the maximum is unlimited, set a repeater in the final copy. We
3452        can't just offset backwards from the current code point, because we        can't just offset backwards from the current code point, because we
3453        don't know if there's been an options resetting after the ket. The        don't know if there's been an options resetting after the ket. The
3454        correct offset was computed above. */        correct offset was computed above.
3455    
3456          Then, when we are doing the actual compile phase, check to see whether
3457          this group is a non-atomic one that could match an empty string. If so,
3458          convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
3459          that runtime checking can be done. [This check is also applied to
3460          atomic groups at runtime, but in a different way.] */
3461    
3462        else code[-ketoffset] = OP_KETRMAX + repeat_type;        else
3463            {
3464            uschar *ketcode = code - ketoffset;
3465            uschar *bracode = ketcode - GET(ketcode, 1);
3466            *ketcode = OP_KETRMAX + repeat_type;
3467            if (lengthptr == NULL && *bracode != OP_ONCE)
3468              {
3469              uschar *scode = bracode;
3470              do
3471                {
3472                if (could_be_empty_branch(scode, ketcode, utf8))
3473                  {
3474                  *bracode += OP_SBRA - OP_BRA;
3475                  break;
3476                  }
3477                scode += GET(scode, 1);
3478                }
3479              while (*scode == OP_ALT);
3480              }
3481            }
3482        }        }
3483    
3484      /* Else there's some kind of shambles */      /* Else there's some kind of shambles */
# Line 2837  for (;; ptr++) Line 3489  for (;; ptr++)
3489        goto FAILED;        goto FAILED;
3490        }        }
3491    
3492      /* If the character following a repeat is '+', we wrap the entire repeated      /* If the character following a repeat is '+', or if certain optimization
3493      item inside OP_ONCE brackets. This is just syntactic sugar, taken from      tests above succeeded, possessive_quantifier is TRUE. For some of the
3494      Sun's Java package. The repeated item starts at tempcode, not at previous,      simpler opcodes, there is an special alternative opcode for this. For
3495      which might be the first part of a string whose (former) last char we      anything else, we wrap the entire repeated item inside OP_ONCE brackets.
3496      repeated. However, we don't support '+' after a greediness '?'. */      The '+' notation is just syntactic sugar, taken from Sun's Java package,
3497        but the special opcodes can optimize it a bit. The repeated item starts at
3498        tempcode, not at previous, which might be the first part of a string whose
3499        (former) last char we repeated.
3500    
3501        Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
3502        an 'upto' may follow. We skip over an 'exact' item, and then test the
3503        length of what remains before proceeding. */
3504    
3505      if (possessive_quantifier)      if (possessive_quantifier)
3506        {        {
3507        int len = code - tempcode;        int len;
3508        memmove(tempcode + 1+LINK_SIZE, tempcode, len);        if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
3509        code += 1 + LINK_SIZE;            *tempcode == OP_NOTEXACT)
3510        len += 1 + LINK_SIZE;          tempcode += _pcre_OP_lengths[*tempcode];
3511        tempcode[0] = OP_ONCE;        len = code - tempcode;
3512        *code++ = OP_KET;        if (len > 0) switch (*tempcode)
3513        PUTINC(code, 0, len);          {
3514        PUT(tempcode, 1, len);          case OP_STAR:  *tempcode = OP_POSSTAR; break;
3515            case OP_PLUS:  *tempcode = OP_POSPLUS; break;
3516            case OP_QUERY: *tempcode = OP_POSQUERY; break;
3517            case OP_UPTO:  *tempcode = OP_POSUPTO; break;
3518    
3519            case OP_TYPESTAR:  *tempcode = OP_TYPEPOSSTAR; break;
3520            case OP_TYPEPLUS:  *tempcode = OP_TYPEPOSPLUS; break;
3521            case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
3522            case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;
3523    
3524            case OP_NOTSTAR:  *tempcode = OP_NOTPOSSTAR; break;
3525            case OP_NOTPLUS:  *tempcode = OP_NOTPOSPLUS; break;
3526            case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
3527            case OP_NOTUPTO:  *tempcode = OP_NOTPOSUPTO; break;
3528    
3529            default:
3530            memmove(tempcode + 1+LINK_SIZE, tempcode, len);
3531            code += 1 + LINK_SIZE;
3532            len += 1 + LINK_SIZE;
3533            tempcode[0] = OP_ONCE;
3534            *code++ = OP_KET;
3535            PUTINC(code, 0, len);
3536            PUT(tempcode, 1, len);
3537            break;
3538            }
3539        }        }
3540    
3541      /* In all cases we no longer have a previous item. We also set the      /* In all cases we no longer have a previous item. We also set the
# Line 2865  for (;; ptr++) Line 3548  for (;; ptr++)
3548      break;      break;
3549    
3550    
3551      /* Start of nested bracket sub-expression, or comment or lookahead or      /* ===================================================================*/
3552      lookbehind or option setting or condition. First deal with special things      /* Start of nested parenthesized sub-expression, or comment or lookahead or
3553      that can come after a bracket; all are introduced by ?, and the appearance      lookbehind or option setting or condition or all the other extended
3554      of any of them means that this is not a referencing group. They were      parenthesis forms. First deal with the specials; all are introduced by ?,
3555      checked for validity in the first pass over the string, so we don't have to      and the appearance of any of them means that this is not a capturing
3556      check for syntax errors here.  */      group. */
3557    
3558      case '(':      case '(':
3559      newoptions = options;      newoptions = options;
3560      skipbytes = 0;      skipbytes = 0;
3561        bravalue = OP_CBRA;
3562        save_hwm = cd->hwm;
3563    
3564      if (*(++ptr) == '?')      if (*(++ptr) == '?')
3565        {        {
3566        int set, unset;        int i, set, unset, namelen;
3567        int *optset;        int *optset;
3568          const uschar *name;
3569          uschar *slot;
3570    
3571        switch (*(++ptr))        switch (*(++ptr))
3572          {          {
3573          case '#':                 /* Comment; skip to ket */          case '#':                 /* Comment; skip to ket */
3574          ptr++;          ptr++;
3575          while (*ptr != ')') ptr++;          while (*ptr != 0 && *ptr != ')') ptr++;
3576            if (*ptr == 0)
3577              {
3578              *errorcodeptr = ERR18;
3579              goto FAILED;
3580              }
3581          continue;          continue;
3582    
3583          case ':':                 /* Non-extracting bracket */  
3584            /* ------------------------------------------------------------ */
3585            case ':':                 /* Non-capturing bracket */
3586          bravalue = OP_BRA;          bravalue = OP_BRA;
3587          ptr++;          ptr++;
3588          break;          break;
3589    
3590    
3591            /* ------------------------------------------------------------ */
3592          case '(':          case '(':
3593          bravalue = OP_COND;       /* Conditional group */          bravalue = OP_COND;       /* Conditional group */
3594    
3595          /* A condition can be a number, referring to a numbered group, a name,          /* A condition can be an assertion, a number (referring to a numbered
3596          referring to a named group, 'R', referring to recursion, or an          group), a name (referring to a named group), or 'R', referring to
3597          assertion. There are two unfortunate ambiguities, caused by history.          recursion. R<digits> and R&name are also permitted for recursion tests.
3598          (a) 'R' can be the recursive thing or the name 'R', and (b) a number  
3599          could be a name that consists of digits. In both cases, we look for a          There are several syntaxes for testing a named group: (?(name)) is used
3600          name first; if not found, we try the other cases. If the first          by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
3601          character after (?( is a word character, we know the rest up to ) will  
3602          also be word characters because the syntax was checked in the first          There are two unfortunate ambiguities, caused by history. (a) 'R' can
3603          pass. */          be the recursive thing or the name 'R' (and similarly for 'R' followed
3604            by digits), and (b) a number could be a name that consists of digits.
3605          if ((cd->ctypes[ptr[1]] & ctype_word) != 0)          In both cases, we look for a name first; if not found, we try the other
3606            {          cases. */
3607            int i, namelen;  
3608            int condref = 0;          /* For conditions that are assertions, check the syntax, and then exit
3609            const uschar *name;          the switch. This will take control down to where bracketed groups,
3610            uschar *slot = cd->name_table;          including assertions, are processed. */
3611    
3612            /* This is needed for all successful cases. */          if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
3613              break;
3614    
3615            skipbytes = 3;          /* Most other conditions use OP_CREF (a couple change to OP_RREF
3616            below), and all need to skip 3 bytes at the start of the group. */
3617    
3618            /* Read the name, but also get it as a number if it's all digits */          code[1+LINK_SIZE] = OP_CREF;
3619            skipbytes = 3;
3620    
3621            name = ++ptr;          /* Check for a test for recursion in a named group. */
3622            while (*ptr != ')')  
3623              {          if (ptr[1] == 'R' && ptr[2] == '&')
3624              if (condref >= 0)            {
3625                condref = ((digitab[*ptr] & ctype_digit) != 0)?            terminator = -1;
3626                  condref * 10 + *ptr - '0' : -1;            ptr += 2;
3627              ptr++;            code[1+LINK_SIZE] = OP_RREF;    /* Change the type of test */
3628              }            }
3629            namelen = ptr - name;  
3630            /* Check for a test for a named group's having been set, using the Perl
3631            syntax (?(<name>) or (?('name') */
3632    
3633            else if (ptr[1] == '<')
3634              {
3635              terminator = '>';
3636            ptr++;            ptr++;
3637              }
3638            else if (ptr[1] == '\'')
3639              {
3640              terminator = '\'';
3641              ptr++;
3642              }
3643            else terminator = 0;
3644    
3645            for (i = 0; i < cd->names_found; i++)          /* We now expect to read a name; anything else is an error */
             {  
             if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;  
             slot += cd->name_entry_size;  
             }  
3646    
3647            /* Found a previous named subpattern */          if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
3648              {
3649              ptr += 1;  /* To get the right offset */
3650              *errorcodeptr = ERR28;
3651              goto FAILED;
3652              }
3653    
3654            if (i < cd->names_found)          /* Read the name, but also get it as a number if it's all digits */
             {  
             condref = GET2(slot, 0);  
             code[1+LINK_SIZE] = OP_CREF;  
             PUT2(code, 2+LINK_SIZE, condref);  
             }  
3655    
3656            /* Search the pattern for a forward reference */          recno = 0;
3657            name = ++ptr;
3658            while ((cd->ctypes[*ptr] & ctype_word) != 0)
3659              {
3660              if (recno >= 0)
3661                recno = ((digitab[*ptr] & ctype_digit) != 0)?
3662                  recno * 10 + *ptr - '0' : -1;
3663              ptr++;
3664              }
3665            namelen = ptr - name;
3666    
3667            else if ((i = find_named_parens(ptr, *brackets, name, namelen)) > 0)          if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
3668              {            {
3669              code[1+LINK_SIZE] = OP_CREF;            ptr--;      /* Error offset */
3670              PUT2(code, 2+LINK_SIZE, i);            *errorcodeptr = ERR26;
3671              }            goto FAILED;
3672              }
3673    
3674            /* Check for 'R' for recursion */          /* Do no further checking in the pre-compile phase. */
3675    
3676            else if (namelen == 1 && *name == 'R')          if (lengthptr != NULL) break;
             {  
             code[1+LINK_SIZE] = OP_CREF;  
             PUT2(code, 2+LINK_SIZE, CREF_RECURSE);  
             }  
3677    
3678            /* Check for a subpattern number */          /* In the real compile we do the work of looking for the actual
3679            reference. */
3680    
3681            else if (condref > 0)          slot = cd->name_table;
3682              {          for (i = 0; i < cd->names_found; i++)
3683              code[1+LINK_SIZE] = OP_CREF;            {
3684              PUT2(code, 2+LINK_SIZE, condref);            if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
3685              }            slot += cd->name_entry_size;
3686              }
3687    
3688            /* Either an unidentified subpattern, or a reference to (?(0) */          /* Found a previous named subpattern */
3689    
3690            else          if (i < cd->names_found)
3691              {
3692              recno = GET2(slot, 0);
3693              PUT2(code, 2+LINK_SIZE, recno);
3694              }
3695    
3696            /* Search the pattern for a forward reference */
3697    
3698            else if ((i = find_parens(ptr, cd->bracount, name, namelen,
3699                            (options & PCRE_EXTENDED) != 0)) > 0)
3700              {
3701              PUT2(code, 2+LINK_SIZE, i);
3702              }
3703    
3704            /* If terminator == 0 it means that the name followed directly after
3705            the opening parenthesis [e.g. (?(abc)...] and in this case there are
3706            some further alternatives to try. For the cases where terminator != 0
3707            [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
3708            now checked all the possibilities, so give an error. */
3709    
3710            else if (terminator != 0)
3711              {
3712              *errorcodeptr = ERR15;
3713              goto FAILED;
3714              }
3715    
3716            /* Check for (?(R) for recursion. Allow digits after R to specify a
3717            specific group number. */
3718    
3719            else if (*name == 'R')
3720              {
3721              recno = 0;
3722              for (i = 1; i < namelen; i++)
3723              {              {
3724              *errorcodeptr = (condref == 0)? ERR35: ERR15;              if ((digitab[name[i]] & ctype_digit) == 0)
3725              goto FAILED;                {
3726                  *errorcodeptr = ERR15;
3727                  goto FAILED;
3728                  }
3729                recno = recno * 10 + name[i] - '0';
3730              }              }
3731              if (recno == 0) recno = RREF_ANY;
3732              code[1+LINK_SIZE] = OP_RREF;      /* Change test type */
3733              PUT2(code, 2+LINK_SIZE, recno);
3734              }
3735    
3736            /* Similarly, check for the (?(DEFINE) "condition", which is always
3737            false. */
3738    
3739            else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
3740              {
3741              code[1+LINK_SIZE] = OP_DEF;
3742              skipbytes = 1;
3743              }
3744    
3745            /* Check for the "name" actually being a subpattern number. */
3746    
3747            else if (recno > 0)
3748              {
3749              PUT2(code, 2+LINK_SIZE, recno);
3750            }            }
3751    
3752          /* For conditions that are assertions, we just fall through, having          /* Either an unidentified subpattern, or a reference to (?(0) */
         set bravalue above. */  
3753    
3754            else
3755              {
3756              *errorcodeptr = (recno == 0)? ERR35: ERR15;
3757              goto FAILED;
3758              }
3759          break;          break;
3760    
3761    
3762            /* ------------------------------------------------------------ */
3763          case '=':                 /* Positive lookahead */          case '=':                 /* Positive lookahead */
3764          bravalue = OP_ASSERT;          bravalue = OP_ASSERT;
3765          ptr++;          ptr++;
3766          break;          break;
3767    
3768    
3769            /* ------------------------------------------------------------ */
3770          case '!':                 /* Negative lookahead */          case '!':                 /* Negative lookahead */
3771          bravalue = OP_ASSERT_NOT;          bravalue = OP_ASSERT_NOT;
3772          ptr++;          ptr++;
3773          break;          break;
3774    
3775          case '<':                 /* Lookbehinds */  
3776          switch (*(++ptr))          /* ------------------------------------------------------------ */
3777            case '<':                 /* Lookbehind or named define */
3778            switch (ptr[1])
3779            {            {
3780            case '=':               /* Positive lookbehind */            case '=':               /* Positive lookbehind */
3781            bravalue = OP_ASSERTBACK;            bravalue = OP_ASSERTBACK;
3782            ptr++;            ptr += 2;
3783            break;            break;
3784    
3785            case '!':               /* Negative lookbehind */            case '!':               /* Negative lookbehind */
3786            bravalue = OP_ASSERTBACK_NOT;            bravalue = OP_ASSERTBACK_NOT;
3787            ptr++;            ptr += 2;
3788            break;            break;
3789    
3790              default:                /* Could be name define, else bad */
3791              if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
3792              ptr++;                  /* Correct offset for error */
3793              *errorcodeptr = ERR24;
3794              goto FAILED;
3795            }            }
3796          break;          break;
3797    
3798    
3799            /* ------------------------------------------------------------ */
3800          case '>':                 /* One-time brackets */          case '>':                 /* One-time brackets */
3801          bravalue = OP_ONCE;          bravalue = OP_ONCE;
3802          ptr++;          ptr++;
3803          break;          break;
3804    
3805    
3806            /* ------------------------------------------------------------ */
3807          case 'C':                 /* Callout - may be followed by digits; */          case 'C':                 /* Callout - may be followed by digits; */
3808          previous_callout = code;  /* Save for later completion */          previous_callout = code;  /* Save for later completion */
3809          after_manual_callout = 1; /* Skip one item before completing */          after_manual_callout = 1; /* Skip one item before completing */
3810          *code++ = OP_CALLOUT;     /* Already checked that the terminating */          *code++ = OP_CALLOUT;
3811            {                       /* closing parenthesis is present. */            {
3812            int n = 0;            int n = 0;
3813            while ((digitab[*(++ptr)] & ctype_digit) != 0)            while ((digitab[*(++ptr)] & ctype_digit) != 0)
3814              n = n * 10 + *ptr - '0';              n = n * 10 + *ptr - '0';
3815              if (*ptr != ')')
3816                {
3817                *errorcodeptr = ERR39;
3818                goto FAILED;
3819                }
3820            if (n > 255)            if (n > 255)
3821              {              {
3822              *errorcodeptr = ERR38;              *errorcodeptr = ERR38;
# Line 3034  for (;; ptr++) Line 3830  for (;; ptr++)
3830          previous = NULL;          previous = NULL;
3831          continue;          continue;
3832    
3833          case 'P':                 /* Named subpattern handling */  
3834          if (*(++ptr) == '<')      /* Definition */          /* ------------------------------------------------------------ */
3835            case 'P':                 /* Python-style named subpattern handling */
3836            if (*(++ptr) == '=' || *ptr == '>')  /* Reference or recursion */
3837              {
3838              is_recurse = *ptr == '>';
3839              terminator = ')';
3840              goto NAMED_REF_OR_RECURSE;
3841              }
3842            else if (*ptr != '<')    /* Test for Python-style definition */
3843              {
3844              *errorcodeptr = ERR41;
3845              goto FAILED;
3846              }
3847            /* Fall through to handle (?P< as (?< is handled */
3848    
3849    
3850            /* ------------------------------------------------------------ */
3851            DEFINE_NAME:    /* Come here from (?< handling */
3852            case '\'':
3853            {            {
3854            int i, namelen;            terminator = (*ptr == '<')? '>' : '\'';
3855            uschar *slot = cd->name_table;            name = ++ptr;
3856            const uschar *name;     /* Don't amalgamate; some compilers */  
3857            name = ++ptr;           /* grumble at autoincrement in declaration */            while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
3858              namelen = ptr - name;
3859    
3860            while (*ptr++ != '>');            /* In the pre-compile phase, just do a syntax check. */
           namelen = ptr - name - 1;  
3861    
3862            for (i = 0; i < cd->names_found; i++)            if (lengthptr != NULL)
3863                {
3864                if (*ptr != terminator)
3865                  {
3866                  *errorcodeptr = ERR42;
3867                  goto FAILED;
3868                  }
3869                if (cd->names_found >= MAX_NAME_COUNT)
3870                  {
3871                  *errorcodeptr = ERR49;
3872                  goto FAILED;
3873                  }
3874                if (namelen + 3 > cd->name_entry_size)
3875                  {
3876                  cd->name_entry_size = namelen + 3;
3877                  if (namelen > MAX_NAME_SIZE)
3878                    {
3879                    *errorcodeptr = ERR48;
3880                    goto FAILED;
3881                    }
3882                  }
3883                }
3884    
3885              /* In the real compile, create the entry in the table */
3886    
3887              else
3888              {              {
3889              int crc = memcmp(name, slot+2, namelen);              slot = cd->name_table;
3890              if (crc == 0)              for (i = 0; i < cd->names_found; i++)
3891                {                {
3892                if (slot[2+namelen] == 0)                int crc = memcmp(name, slot+2, namelen);
3893                  if (crc == 0)
3894                  {                  {
3895                  if ((options & PCRE_DUPNAMES) == 0)                  if (slot[2+namelen] == 0)
3896                    {                    {
3897                    *errorcodeptr = ERR43;                    if ((options & PCRE_DUPNAMES) == 0)
3898                    goto FAILED;                      {
3899                        *errorcodeptr = ERR43;
3900                        goto FAILED;
3901                        }
3902                    }                    }
3903                    else crc = -1;      /* Current name is substring */
3904                  }                  }
3905                else crc = -1;      /* Current name is substring */                if (crc < 0)
3906                }                  {
3907              if (crc < 0)                  memmove(slot + cd->name_entry_size, slot,
3908                {                    (cd->names_found - i) * cd->name_entry_size);
3909                memmove(slot + cd->name_entry_size, slot,                  break;
3910                  (cd->names_found - i) * cd->name_entry_size);                  }
3911                break;                slot += cd->name_entry_size;
3912                }                }
             slot += cd->name_entry_size;  
             }  
3913    
3914            PUT2(slot, 0, *brackets + 1);              PUT2(slot, 0, cd->bracount + 1);
3915            memcpy(slot + 2, name, namelen);              memcpy(slot + 2, name, namelen);
3916            slot[2+namelen] = 0;              slot[2+namelen] = 0;
3917            cd->names_found++;              }
           goto NUMBERED_GROUP;  
3918            }            }
3919    
3920          if (*ptr == '=' || *ptr == '>')  /* Reference or recursion */          /* In both cases, count the number of names we've encountered. */
           {  
           int i, namelen;  
           int type = *ptr++;  
           const uschar *name = ptr;  
           uschar *slot = cd->name_table;  
3921    
3922            while (*ptr != ')') ptr++;          ptr++;                    /* Move past > or ' */
3923            namelen = ptr - name;          cd->names_found++;
3924            goto NUMBERED_GROUP;
3925    
3926            for (i = 0; i < cd->names_found; i++)  
3927            /* ------------------------------------------------------------ */
3928            case '&':                 /* Perl recursion/subroutine syntax */
3929            terminator = ')';
3930            is_recurse = TRUE;
3931            /* Fall through */
3932    
3933            /* We come here from the Python syntax above that handles both
3934            references (?P=name) and recursion (?P>name), as well as falling
3935            through from the Perl recursion syntax (?&name). */
3936    
3937            NAMED_REF_OR_RECURSE:
3938            name = ++ptr;
3939            while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
3940            namelen = ptr - name;
3941    
3942            /* In the pre-compile phase, do a syntax check and set a dummy
3943            reference number. */
3944    
3945            if (lengthptr != NULL)
3946              {
3947              if (*ptr != terminator)
3948              {              {
3949              if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;              *errorcodeptr = ERR42;
3950              slot += cd->name_entry_size;              goto FAILED;
3951              }              }
3952              if (namelen > MAX_NAME_SIZE)
3953            if (i < cd->names_found)         /* Back reference */              {
3954                *errorcodeptr = ERR48;
3955                goto FAILED;
3956                }
3957              recno = 0;
3958              }
3959    
3960            /* In the real compile, seek the name in the table */
3961    
3962            else
3963              {
3964              slot = cd->name_table;
3965              for (i = 0; i < cd->names_found; i++)
3966                {
3967                if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
3968                slot += cd->name_entry_size;
3969                }
3970    
3971              if (i < cd->names_found)         /* Back reference */
3972              {              {
3973              recno = GET2(slot, 0);              recno = GET2(slot, 0);
3974              }              }
3975            else if ((recno =                /* Forward back reference */            else if ((recno =                /* Forward back reference */
3976                      find_named_parens(ptr, *brackets, name, namelen)) <= 0)                      find_parens(ptr, cd->bracount, name, namelen,
3977                          (options & PCRE_EXTENDED) != 0)) <= 0)
3978              {              {
3979              *errorcodeptr = ERR15;              *errorcodeptr = ERR15;
3980              goto FAILED;              goto FAILED;
3981              }              }
3982              }
3983    
3984            if (type == '>') goto HANDLE_RECURSION;  /* A few lines below */          /* In both phases, we can now go to the code that handles numerical
3985            recursion or backreferences. */
           /* Back reference */  
3986    
3987            previous = code;          if (is_recurse) goto HANDLE_RECURSION;
3988            *code++ = OP_REF;            else goto HANDLE_REFERENCE;
           PUT2INC(code, 0, recno);  
           cd->backref_map |= (recno < 32)? (1 << recno) : 1;  
           if (recno > cd->top_backref) cd->top_backref = recno;  
           continue;  
           }  
3989    
         /* Should never happen */  
         break;  
3990    
3991          case 'R':                 /* Pattern recursion */          /* ------------------------------------------------------------ */
3992            case 'R':                 /* Recursion */
3993          ptr++;                    /* Same as (?0)      */          ptr++;                    /* Same as (?0)      */
3994          /* Fall through */          /* Fall through */
3995    
         /* Recursion or "subroutine" call */  
3996    
3997          case '0': case '1': case '2': case '3': case '4':          /* ------------------------------------------------------------ */
3998          case '5': case '6': case '7': case '8': case '9':          case '0': case '1': case '2': case '3': case '4':   /* Recursion or */
3999            case '5': case '6': case '7': case '8': case '9':   /* subroutine */
4000            {            {
4001            const uschar *called;            const uschar *called;
4002            recno = 0;            recno = 0;
4003            while((digitab[*ptr] & ctype_digit) != 0)            while((digitab[*ptr] & ctype_digit) != 0)
4004              recno = recno * 10 + *ptr++ - '0';              recno = recno * 10 + *ptr++ - '0';
4005              if (*ptr != ')')
4006                {
4007                *errorcodeptr = ERR29;
4008                goto FAILED;
4009                }
4010    
4011            /* Come here from code above that handles a named recursion */            /* Come here from code above that handles a named recursion */
4012    
4013            HANDLE_RECURSION:            HANDLE_RECURSION:
4014    
4015            previous = code;            previous = code;
4016              called = cd->start_code;
4017    
4018            /* Find the bracket that is being referenced. Temporarily end the            /* When we are actually compiling, find the bracket that is being
4019            regex in case it doesn't exist. */            referenced. Temporarily end the regex in case it doesn't exist before
4020              this point. If we end up with a forward reference, first check that
4021              the bracket does occur later so we can give the error (and position)
4022              now. Then remember this forward reference in the workspace so it can
4023              be filled in at the end. */
4024    
4025            *code = OP_END;            if (lengthptr == NULL)
           called = (recno == 0)? cd->start_code :  
             find_bracket(cd->start_code, utf8, recno);  
           if (called == NULL)  
4026              {              {
4027              *errorcodeptr = ERR15;              *code = OP_END;
4028              goto FAILED;              if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
             }  
4029    
4030            /* If the subpattern is still open, this is a recursive call. We              /* Forward reference */
           check to see if this is a left recursion that could loop for ever,  
           and diagnose that case. */  
4031    
4032            if (GET(called, 1) == 0 && could_be_empty(called, code, bcptr, utf8))              if (called == NULL)
4033              {                {
4034              *errorcodeptr = ERR40;                if (find_parens(ptr, cd->bracount, NULL, recno,
4035              goto FAILED;                     (options & PCRE_EXTENDED) != 0) < 0)
4036                    {
4037                    *errorcodeptr = ERR15;
4038                    goto FAILED;
4039                    }
4040                  called = cd->start_code + recno;
4041                  PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
4042                  }
4043    
4044                /* If not a forward reference, and the subpattern is still open,
4045                this is a recursive call. We check to see if this is a left
4046                recursion that could loop for ever, and diagnose that case. */
4047    
4048                else if (GET(called, 1) == 0 &&
4049                         could_be_empty(called, code, bcptr, utf8))
4050                  {
4051                  *errorcodeptr = ERR40;
4052                  goto FAILED;
4053                  }
4054              }              }
4055    
4056            /* Insert the recursion/subroutine item, automatically wrapped inside            /* Insert the recursion/subroutine item, automatically wrapped inside
4057            "once" brackets. */            "once" brackets. Set up a "previous group" length so that a
4058              subsequent quantifier will work. */
4059    
4060            *code = OP_ONCE;            *code = OP_ONCE;
4061            PUT(code, 1, 2 + 2*LINK_SIZE);            PUT(code, 1, 2 + 2*LINK_SIZE);
# Line 3174  for (;; ptr++) Line 4068  for (;; ptr++)
4068            *code = OP_KET;            *code = OP_KET;
4069            PUT(code, 1, 2 + 2*LINK_SIZE);            PUT(code, 1, 2 + 2*LINK_SIZE);
4070            code += 1 + LINK_SIZE;            code += 1 + LINK_SIZE;
4071    
4072              length_prevgroup = 3 + 3*LINK_SIZE;
4073            }            }
4074    
4075            /* Can't determine a first byte now */
4076    
4077            if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4078          continue;          continue;
4079    
         /* Character after (? not specially recognized */  
4080    
4081          default:                  /* Option setting */          /* ------------------------------------------------------------ */
4082            default:              /* Other characters: check option setting */
4083          set = unset = 0;          set = unset = 0;
4084          optset = &set;          optset = &set;
4085    
# Line 3189  for (;; ptr++) Line 4089  for (;; ptr++)
4089              {              {
4090              case '-': optset = &unset; break;              case '-': optset = &unset; break;
4091    
4092                case 'J':    /* Record that it changed in the external options */
4093                *optset |= PCRE_DUPNAMES;
4094                cd->external_options |= PCRE_JCHANGED;
4095                break;
4096    
4097              case 'i': *optset |= PCRE_CASELESS; break;              case 'i': *optset |= PCRE_CASELESS; break;
             case 'J': *optset |= PCRE_DUPNAMES; break;  
4098              case 'm': *optset |= PCRE_MULTILINE; break;              case 'm': *optset |= PCRE_MULTILINE; break;
4099              case 's': *optset |= PCRE_DOTALL; break;              case 's': *optset |= PCRE_DOTALL; break;
4100              case 'x': *optset |= PCRE_EXTENDED; break;              case 'x': *optset |= PCRE_EXTENDED; break;
4101              case 'U': *optset |= PCRE_UNGREEDY; break;              case 'U': *optset |= PCRE_UNGREEDY; break;
4102              case 'X': *optset |= PCRE_EXTRA; break;              case 'X': *optset |= PCRE_EXTRA; break;
4103    
4104                default:  *errorcodeptr = ERR12;
4105                          ptr--;    /* Correct the offset */
4106                          goto FAILED;
4107              }              }
4108            }            }
4109    
# Line 3204  for (;; ptr++) Line 4112  for (;; ptr++)
4112          newoptions = (options | set) & (~unset);          newoptions = (options | set) & (~unset);
4113    
4114          /* If the options ended with ')' this is not the start of a nested          /* If the options ended with ')' this is not the start of a nested
4115          group with option changes, so the options change at this level. Compile          group with option changes, so the options change at this level. If this
4116          code to change the ims options if this setting actually changes any of          item is right at the start of the pattern, the options can be
4117          them. We also pass the new setting back so that it can be put at the          abstracted and made external in the pre-compile phase, and ignored in
4118          start of any following branches, and when this group ends (if we are in          the compile phase. This can be helpful when matching -- for instance in
4119          a group), a resetting item can be compiled.          caseless checking of required bytes.
4120    
4121          Note that if this item is right at the start of the pattern, the          If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
4122          options will have been abstracted and made global, so there will be no          definitely *not* at the start of the pattern because something has been
4123          change to compile. */          compiled. In the pre-compile phase, however, the code pointer can have
4124            that value after the start, because it gets reset as code is discarded
4125            during the pre-compile. However, this can happen only at top level - if
4126            we are within parentheses, the starting BRA will still be present. At
4127            any parenthesis level, the length value can be used to test if anything
4128            has been compiled at that level. Thus, a test for both these conditions
4129            is necessary to ensure we correctly detect the start of the pattern in
4130            both phases.
4131    
4132            If we are not at the pattern start, compile code to change the ims
4133            options if this setting actually changes any of them. We also pass the
4134            new setting back so that it can be put at the start of any following
4135            branches, and when this group ends (if we are in a group), a resetting
4136            item can be compiled. */
4137    
4138          if (*ptr == ')')          if (*ptr == ')')
4139            {            {
4140            if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))            if (code == cd->start_code + 1 + LINK_SIZE &&
4141                   (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
4142              {              {
4143              *code++ = OP_OPT;              cd->external_options = newoptions;
4144              *code++ = newoptions & PCRE_IMS;              options = newoptions;
4145              }              }
4146             else
4147                {
4148                if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
4149                  {
4150                  *code++ = OP_OPT;
4151                  *code++ = newoptions & PCRE_IMS;
4152                  }
4153    
4154            /* Change options at this level, and pass them back for use              /* Change options at this level, and pass them back for use
4155            in subsequent branches. Reset the greedy defaults and the case              in subsequent branches. Reset the greedy defaults and the case
4156            value for firstbyte and reqbyte. */              value for firstbyte and reqbyte. */
4157    
4158            *optionsptr = options = newoptions;              *optionsptr = options = newoptions;
4159            greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);              greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
4160            greedy_non_default = greedy_default ^ 1;              greedy_non_default = greedy_default ^ 1;
4161            req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;              req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
4162                }
4163    
4164            previous = NULL;       /* This item can't be repeated */            previous = NULL;       /* This item can't be repeated */
4165            continue;              /* It is complete */            continue;              /* It is complete */
# Line 3242  for (;; ptr++) Line 4172  for (;; ptr++)
4172    
4173          bravalue = OP_BRA;          bravalue = OP_BRA;
4174          ptr++;          ptr++;
4175          }          }     /* End of switch for character following (? */
4176        }        }       /* End of (? handling */
4177    
4178      /* If PCRE_NO_AUTO_CAPTURE is set, all unadorned brackets become      /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
4179      non-capturing and behave like (?:...) brackets */      all unadorned brackets become non-capturing and behave like (?:...)
4180        brackets. */
4181    
4182      else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)      else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
4183        {        {
4184        bravalue = OP_BRA;        bravalue = OP_BRA;
4185        }        }
4186    
4187      /* Else we have a referencing group; adjust the opcode. If the bracket      /* Else we have a capturing group. */
     number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and  
     arrange for the true number to follow later, in an OP_BRANUMBER item. */  
4188    
4189      else      else
4190        {        {
4191        NUMBERED_GROUP:        NUMBERED_GROUP:
4192        if (++(*brackets) > EXTRACT_BASIC_MAX)        cd->bracount += 1;
4193          {        PUT2(code, 1+LINK_SIZE, cd->bracount);
4194          bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;        skipbytes = 2;
         code[1+LINK_SIZE] = OP_BRANUMBER;  
         PUT2(code, 2+LINK_SIZE, *brackets);  
         skipbytes = 3;  
         }  
       else bravalue = OP_BRA + *brackets;  
4195        }        }
4196    
4197      /* Process nested bracketed re. Assertions may not be repeated, but other      /* Process nested bracketed regex. Assertions may not be repeated, but
4198      kinds can be. We copy code into a non-register variable in order to be able      other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
4199      to pass its address because some compilers complain otherwise. Pass in a      non-register variable in order to be able to pass its address because some
4200      new setting for the ims options if they have changed. */      compilers complain otherwise. Pass in a new setting for the ims options if
4201        they have changed. */
4202    
4203      previous = (bravalue >= OP_ONCE)? code : NULL;      previous = (bravalue >= OP_ONCE)? code : NULL;
4204      *code = bravalue;      *code = bravalue;
4205      tempcode = code;      tempcode = code;
4206      tempreqvary = cd->req_varyopt;     /* Save value before bracket */      tempreqvary = cd->req_varyopt;     /* Save value before bracket */
4207        length_prevgroup = 0;              /* Initialize for pre-compile phase */
4208    
4209      if (!compile_regex(      if (!compile_regex(
4210           newoptions,                   /* The complete new option state */           newoptions,                   /* The complete new option state */
4211           options & PCRE_IMS,           /* The previous ims option state */           options & PCRE_IMS,           /* The previous ims option state */
          brackets,                     /* Extracting bracket count */  
4212           &tempcode,                    /* Where to put code (updated) */           &tempcode,                    /* Where to put code (updated) */
4213           &ptr,                         /* Input pointer (updated) */           &ptr,                         /* Input pointer (updated) */
4214           errorcodeptr,                 /* Where to put an error message */           errorcodeptr,                 /* Where to put an error message */
4215           (bravalue == OP_ASSERTBACK ||           (bravalue == OP_ASSERTBACK ||
4216            bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */            bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
4217           skipbytes,                    /* Skip over OP_COND/OP_BRANUMBER */           skipbytes,                    /* Skip over bracket number */
4218           &subfirstbyte,                /* For possible first char */           &subfirstbyte,                /* For possible first char */
4219           &subreqbyte,                  /* For possible last char */           &subreqbyte,                  /* For possible last char */
4220           bcptr,                        /* Current branch chain */           bcptr,                        /* Current branch chain */
4221           cd))                          /* Tables block */           cd,                           /* Tables block */
4222             (lengthptr == NULL)? NULL :   /* Actual compile phase */
4223               &length_prevgroup           /* Pre-compile phase */
4224             ))
4225        goto FAILED;        goto FAILED;
4226    
4227      /* At the end of compiling, code is still pointing to the start of the      /* At the end of compiling, code is still pointing to the start of the
# Line 3302  for (;; ptr++) Line 4230  for (;; ptr++)
4230      is on the bracket. */      is on the bracket. */
4231    
4232      /* If this is a conditional bracket, check that there are no more than      /* If this is a conditional bracket, check that there are no more than
4233      two branches in the group. */      two branches in the group, or just one if it's a DEFINE group. */
4234    
4235      else if (bravalue == OP_COND)      if (bravalue == OP_COND)
4236        {        {
4237        uschar *tc = code;        uschar *tc = code;
4238        int condcount = 0;        int condcount = 0;
# Line 3315  for (;; ptr++) Line 4243  for (;; ptr++)
4243           }           }
4244        while (*tc != OP_KET);        while (*tc != OP_KET);
4245    
4246        if (condcount > 2)        /* A DEFINE group is never obeyed inline (the "condition" is always
4247          false). It must have only one branch. */
4248    
4249          if (code[LINK_SIZE+1] == OP_DEF)
4250          {          {
4251          *errorcodeptr = ERR27;          if (condcount > 1)
4252          goto FAILED;            {
4253              *errorcodeptr = ERR54;
4254              goto FAILED;
4255              }
4256            bravalue = OP_DEF;   /* Just a flag to suppress char handling below */
4257            }
4258    
4259          /* A "normal" conditional group. If there is just one branch, we must not
4260          make use of its firstbyte or reqbyte, because this is equivalent to an
4261          empty second branch. */
4262    
4263          else
4264            {
4265            if (condcount > 2)
4266              {
4267              *errorcodeptr = ERR27;
4268              goto FAILED;
4269              }
4270            if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
4271          }          }
4272          }
4273    
4274        /* Error if hit end of pattern */
4275    
4276        /* If there is just one branch, we must not make use of its firstbyte or      if (*ptr != ')')
4277        reqbyte, because this is equivalent to an empty second branch. */        {
4278          *errorcodeptr = ERR14;
4279          goto FAILED;
4280          }
4281    
4282        if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;      /* In the pre-compile phase, update the length by the length of the nested
4283        group, less the brackets at either end. Then reduce the compiled code to
4284        just the brackets so that it doesn't use much memory if it is duplicated by
4285        a quantifier. */
4286    
4287        if (lengthptr != NULL)
4288          {
4289          *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
4290          code++;
4291          PUTINC(code, 0, 1 + LINK_SIZE);
4292          *code++ = OP_KET;
4293          PUTINC(code, 0, 1 + LINK_SIZE);
4294        }        }
4295    
4296      /* Handle updating of the required and first characters. Update for normal      /* Otherwise update the main code pointer to the end of the group. */
4297      brackets of all kinds, and conditions with two branches (see code above).  
4298      If the bracket is followed by a quantifier with zero repeat, we have to      else code = tempcode;
4299      back off. Hence the definition of zeroreqbyte and zerofirstbyte outside the  
4300      main loop so that they can be accessed for the back off. */      /* For a DEFINE group, required and first character settings are not
4301        relevant. */
4302    
4303        if (bravalue == OP_DEF) break;
4304    
4305        /* Handle updating of the required and first characters for other types of
4306        group. Update for normal brackets of all kinds, and conditions with two
4307        branches (see code above). If the bracket is followed by a quantifier with
4308        zero repeat, we have to back off. Hence the definition of zeroreqbyte and
4309        zerofirstbyte outside the main loop so that they can be accessed for the
4310        back off. */
4311    
4312      zeroreqbyte = reqbyte;      zeroreqbyte = reqbyte;
4313      zerofirstbyte = firstbyte;      zerofirstbyte = firstbyte;
4314      groupsetfirstbyte = FALSE;      groupsetfirstbyte = FALSE;
4315    
4316      if (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_COND)      if (bravalue >= OP_ONCE)
4317        {        {
4318        /* If we have not yet set a firstbyte in this branch, take it from the        /* If we have not yet set a firstbyte in this branch, take it from the
4319        subpattern, remembering that it was set here so that a repeat of more        subpattern, remembering that it was set here so that a repeat of more
# Line 3378  for (;; ptr++) Line 4354  for (;; ptr++)
4354      firstbyte, looking for an asserted first char. */      firstbyte, looking for an asserted first char. */
4355    
4356      else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;      else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
4357        break;     /* End of processing '(' */
4358    
     /* Now update the main code pointer to the end of the group. */  
   
     code = tempcode;  
   
     /* Error if hit end of pattern */  
   
     if (*ptr != ')')  
       {  
       *errorcodeptr = ERR14;  
       goto FAILED;  
       }  
     break;  
   
     /* Check \ for being a real metacharacter; if not, fall through and handle  
     it as a data character at the start of a string. Escape items are checked  
     for validity in the pre-compiling pass. */  
   
     case '\\':  
     tempptr = ptr;  
     c = check_escape(&ptr, errorcodeptr, *brackets, options, FALSE);  
4359    
4360      /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values      /* ===================================================================*/
4361        /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
4362      are arranged to be the negation of the corresponding OP_values. For the      are arranged to be the negation of the corresponding OP_values. For the
4363      back references, the values are ESC_REF plus the reference number. Only      back references, the values are ESC_REF plus the reference number. Only
4364      back references and those types that consume a character may be repeated.      back references and those types that consume a character may be repeated.
4365      We can test for values between ESC_b and ESC_Z for the latter; this may      We can test for values between ESC_b and ESC_Z for the latter; this may
4366      have to change if any new ones are ever created. */      have to change if any new ones are ever created. */
4367    
4368        case '\\':
4369        tempptr = ptr;
4370        c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
4371        if (*errorcodeptr != 0) goto FAILED;
4372    
4373      if (c < 0)      if (c < 0)
4374        {        {
4375        if (-c == ESC_Q)            /* Handle start of quoted string */        if (-c == ESC_Q)            /* Handle start of quoted string */
# Line 3416  for (;; ptr++) Line 4379  for (;; ptr++)
4379          continue;          continue;
4380          }          }
4381    
4382          if (-c == ESC_E) continue;  /* Perl ignores an orphan \E */
4383    
4384        /* For metasequences that actually match a character, we disable the        /* For metasequences that actually match a character, we disable the
4385        setting of a first character if it hasn't already been set. */        setting of a first character if it hasn't already been set. */
4386    
# Line 3427  for (;; ptr++) Line 4392  for (;; ptr++)
4392        zerofirstbyte = firstbyte;        zerofirstbyte = firstbyte;
4393        zeroreqbyte = reqbyte;        zeroreqbyte = reqbyte;
4394    
4395        /* Back references are handled specially */        /* \k<name> or \k'name' is a back reference by name (Perl syntax) */
4396    
4397          if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\''))
4398            {
4399            is_recurse = FALSE;
4400            terminator = (*(++ptr) == '<')? '>' : '\'';
4401            goto NAMED_REF_OR_RECURSE;
4402            }
4403    
4404          /* Back references are handled specially; must disable firstbyte if
4405          not set to cope with cases like (?=(\w+))\1: which would otherwise set
4406          ':' later. */
4407    
4408        if (-c >= ESC_REF)        if (-c >= ESC_REF)
4409          {          {
4410          int number = -c - ESC_REF;          recno = -c - ESC_REF;
4411    
4412            HANDLE_REFERENCE:    /* Come here from named backref handling */
4413            if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4414          previous = code;          previous = code;
4415          *code++ = OP_REF;          *code++ = OP_REF;
4416          PUT2INC(code, 0, number);          PUT2INC(code, 0, recno);
4417            cd->backref_map |= (recno < 32)? (1 << recno) : 1;
4418            if (recno > cd->top_backref) cd->top_backref = recno;
4419          }          }
4420    
4421        /* So are Unicode property matches, if supported. We know that get_ucp        /* So are Unicode property matches, if supported. */
       won't fail because it was tested in the pre-pass. */  
4422    
4423  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
4424        else if (-c == ESC_P || -c == ESC_p)        else if (-c == ESC_P || -c == ESC_p)
# Line 3446  for (;; ptr++) Line 4426  for (;; ptr++)
4426          BOOL negated;          BOOL negated;
4427          int pdata;          int pdata;
4428          int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);          int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
4429            if (ptype < 0) goto FAILED;
4430          previous = code;          previous = code;
4431          *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;          *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
4432          *code++ = ptype;          *code++ = ptype;
4433          *code++ = pdata;          *code++ = pdata;
4434          }          }
4435    #else
4436    
4437          /* If Unicode properties are not supported, \X, \P, and \p are not
4438          allowed. */
4439    
4440          else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
4441            {
4442            *errorcodeptr = ERR45;
4443            goto FAILED;
4444            }
4445  #endif  #endif
4446    
4447        /* For the rest, we can obtain the OP value by negating the escape        /* For the rest (including \X when Unicode properties are supported), we
4448        value */        can obtain the OP value by negating the escape value. */
4449    
4450        else        else
4451          {          {
# Line 3478  for (;; ptr++) Line 4469  for (;; ptr++)
4469       mcbuffer[0] = c;       mcbuffer[0] = c;
4470       mclength = 1;       mclength = 1;
4471       }       }
   
4472      goto ONE_CHAR;      goto ONE_CHAR;
4473    
4474    
4475        /* ===================================================================*/
4476      /* Handle a literal character. It is guaranteed not to be whitespace or #      /* Handle a literal character. It is guaranteed not to be whitespace or #
4477      when the extended flag is set. If we are in UTF-8 mode, it may be a      when the extended flag is set. If we are in UTF-8 mode, it may be a
4478      multi-byte literal character. */      multi-byte literal character. */
# Line 3491  for (;; ptr++) Line 4483  for (;; ptr++)
4483      mcbuffer[0] = c;      mcbuffer[0] = c;
4484    
4485  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
4486      if (utf8 && (c & 0xc0) == 0xc0)      if (utf8 && c >= 0xc0)
4487        {        {
4488        while ((ptr[1] & 0xc0) == 0x80)        while ((ptr[1] & 0xc0) == 0x80)
4489          mcbuffer[mclength++] = *(++ptr);          mcbuffer[mclength++] = *(++ptr);
# Line 3542  for (;; ptr++) Line 4534  for (;; ptr++)
4534      }      }
4535    }                   /* end of big loop */    }                   /* end of big loop */
4536    
4537    
4538  /* Control never reaches here by falling through, only by a goto for all the  /* Control never reaches here by falling through, only by a goto for all the
4539  error states. Pass back the position in the pattern so that it can be displayed  error states. Pass back the position in the pattern so that it can be displayed
4540  to the user for diagnosing the error. */  to the user for diagnosing the error. */
# Line 3558  return FALSE; Line 4551  return FALSE;
4551  *     Compile sequence of alternatives           *  *     Compile sequence of alternatives           *
4552  *************************************************/  *************************************************/
4553    
4554  /* On entry, ptr is pointing past the bracket character, but on return  /* On entry, ptr is pointing past the bracket character, but on return it
4555  it points to the closing bracket, or vertical bar, or end of string.  points to the closing bracket, or vertical bar, or end of string. The code
4556  The code variable is pointing at the byte into which the BRA operator has been  variable is pointing at the byte into which the BRA operator has been stored.
4557  stored. If the ims options are changed at the start (for a (?ims: group) or  If the ims options are changed at the start (for a (?ims: group) or during any
4558  during any branch, we need to insert an OP_OPT item at the start of every  branch, we need to insert an OP_OPT item at the start of every following branch
4559  following branch to ensure they get set correctly at run time, and also pass  to ensure they get set correctly at run time, and also pass the new options
4560  the new options into every subsequent branch compile.  into every subsequent branch compile.
4561    
4562    This function is used during the pre-compile phase when we are trying to find
4563    out the amount of memory needed, as well as during the real compile phase. The
4564    value of lengthptr distinguishes the two phases.
4565    
4566  Argument:  Argument:
4567    options        option bits, including any changes for this subpattern    options        option bits, including any changes for this subpattern
4568    oldims         previous settings of ims option bits    oldims         previous settings of ims option bits
   brackets       -> int containing the number of extracting brackets used  
4569    codeptr        -> the address of the current code pointer    codeptr        -> the address of the current code pointer
4570    ptrptr         -> the address of the current pattern pointer    ptrptr         -> the address of the current pattern pointer
4571    errorcodeptr   -> pointer to error code variable    errorcodeptr   -> pointer to error code variable
4572    lookbehind     TRUE if this is a lookbehind assertion    lookbehind     TRUE if this is a lookbehind assertion
4573    skipbytes      skip this many bytes at start (for OP_COND, OP_BRANUMBER)    skipbytes      skip this many bytes at start (for brackets and OP_COND)
4574    firstbyteptr   place to put the first required character, or a negative number    firstbyteptr   place to put the first required character, or a negative number
4575    reqbyteptr     place to put the last required character, or a negative number    reqbyteptr     place to put the last required character, or a negative number
4576    bcptr          pointer to the chain of currently open branches    bcptr          pointer to the chain of currently open branches
4577    cd             points to the data block with tables pointers etc.    cd             points to the data block with tables pointers etc.
4578      lengthptr      NULL during the real compile phase
4579                     points to length accumulator during pre-compile phase
4580    
4581  Returns:      TRUE on success  Returns:         TRUE on success
4582  */  */
4583    
4584  static BOOL  static BOOL
4585  compile_regex(int options, int oldims, int *brackets, uschar **codeptr,  compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
4586    const uschar **ptrptr, int *errorcodeptr, BOOL lookbehind, int skipbytes,    int *errorcodeptr, BOOL lookbehind, int skipbytes, int *firstbyteptr,
4587    int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd)    int *reqbyteptr, branch_chain *bcptr, compile_data *cd, int *lengthptr)
4588  {  {
4589  const uschar *ptr = *ptrptr;  const uschar *ptr = *ptrptr;
4590  uschar *code = *codeptr;  uschar *code = *codeptr;
# Line 3595  uschar *start_bracket = code; Line 4593  uschar *start_bracket = code;
4593  uschar *reverse_count = NULL;  uschar *reverse_count = NULL;
4594  int firstbyte, reqbyte;  int firstbyte, reqbyte;
4595  int branchfirstbyte, branchreqbyte;  int branchfirstbyte, branchreqbyte;
4596    int length;
4597  branch_chain bc;  branch_chain bc;
4598    
4599  bc.outer = bcptr;  bc.outer = bcptr;
# Line 3602  bc.current = code; Line 4601  bc.current = code;
4601    
4602  firstbyte = reqbyte = REQ_UNSET;  firstbyte = reqbyte = REQ_UNSET;
4603    
4604    /* Accumulate the length for use in the pre-compile phase. Start with the
4605    length of the BRA and KET and any extra bytes that are required at the
4606    beginning. We accumulate in a local variable to save frequent testing of
4607    lengthptr for NULL. We cannot do this by looking at the value of code at the
4608    start and end of each alternative, because compiled items are discarded during
4609    the pre-compile phase so that the work space is not exceeded. */
4610    
4611    length = 2 + 2*LINK_SIZE + skipbytes;
4612    
4613    /* WARNING: If the above line is changed for any reason, you must also change
4614    the code that abstracts option settings at the start of the pattern and makes
4615    them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
4616    pre-compile phase to find out whether anything has yet been compiled or not. */
4617    
4618  /* Offset is set zero to mark that this bracket is still open */  /* Offset is set zero to mark that this bracket is still open */
4619    
4620  PUT(code, 1, 0);  PUT(code, 1, 0);
# Line 3617  for (;;) Line 4630  for (;;)
4630      {      {
4631      *code++ = OP_OPT;      *code++ = OP_OPT;
4632      *code++ = options & PCRE_IMS;      *code++ = options & PCRE_IMS;
4633        length += 2;
4634      }      }
4635    
4636    /* Set up dummy OP_REVERSE if lookbehind assertion */    /* Set up dummy OP_REVERSE if lookbehind assertion */
# Line 3626  for (;;) Line 4640  for (;;)
4640      *code++ = OP_REVERSE;      *code++ = OP_REVERSE;
4641      reverse_count = code;      reverse_count = code;
4642      PUTINC(code, 0, 0);      PUTINC(code, 0, 0);
4643        length += 1 + LINK_SIZE;
4644      }      }
4645    
4646    /* Now compile the branch */    /* Now compile the branch; in the pre-compile phase its length gets added
4647      into the length. */
4648    
4649    if (!compile_branch(&options, brackets, &code, &ptr, errorcodeptr,    if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
4650          &branchfirstbyte, &branchreqbyte, &bc, cd))          &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
4651      {      {
4652      *ptrptr = ptr;      *ptrptr = ptr;
4653      return FALSE;      return FALSE;
4654      }      }
4655    
4656    /* If this is the first branch, the firstbyte and reqbyte values for the    /* In the real compile phase, there is some post-processing to be done. */
   branch become the values for the regex. */  
4657    
4658    if (*last_branch != OP_ALT)    if (lengthptr == NULL)
4659      {      {
4660      firstbyte = branchfirstbyte;      /* If this is the first branch, the firstbyte and reqbyte values for the
4661      reqbyte = branchreqbyte;      branch become the values for the regex. */
     }  
4662    
4663    /* If this is not the first branch, the first char and reqbyte have to      if (*last_branch != OP_ALT)
4664    match the values from all the previous branches, except that if the previous        {
4665    value for reqbyte didn't have REQ_VARY set, it can still match, and we set        firstbyte = branchfirstbyte;
4666    REQ_VARY for the regex. */        reqbyte = branchreqbyte;
4667          }
4668    
4669    else      /* If this is not the first branch, the first char and reqbyte have to
4670      {      match the values from all the previous branches, except that if the
4671      /* If we previously had a firstbyte, but it doesn't match the new branch,      previous value for reqbyte didn't have REQ_VARY set, it can still match,
4672      we have to abandon the firstbyte for the regex, but if there was previously      and we set REQ_VARY for the regex. */
     no reqbyte, it takes on the value of the old firstbyte. */  
4673    
4674      if (firstbyte >= 0 && firstbyte != branchfirstbyte)      else
4675        {        {
4676        if (reqbyte < 0) reqbyte = firstbyte;        /* If we previously had a firstbyte, but it doesn't match the new branch,
4677        firstbyte = REQ_NONE;        we have to abandon the firstbyte for the regex, but if there was
4678        }        previously no reqbyte, it takes on the value of the old firstbyte. */
4679    
4680          if (firstbyte >= 0 && firstbyte != branchfirstbyte)
4681            {
4682            if (reqbyte < 0) reqbyte = firstbyte;
4683            firstbyte = REQ_NONE;
4684            }
4685    
4686      /* If we (now or from before) have no firstbyte, a firstbyte from the        /* If we (now or from before) have no firstbyte, a firstbyte from the
4687      branch becomes a reqbyte if there isn't a branch reqbyte. */        branch becomes a reqbyte if there isn't a branch reqbyte. */
4688    
4689      if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)        if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
4690          branchreqbyte = branchfirstbyte;            branchreqbyte = branchfirstbyte;
4691    
4692      /* Now ensure that the reqbytes match */        /* Now ensure that the reqbytes match */
4693    
4694      if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))        if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
4695        reqbyte = REQ_NONE;          reqbyte = REQ_NONE;
4696      else reqbyte |= branchreqbyte;   /* To "or" REQ_VARY */        else reqbyte |= branchreqbyte;   /* To "or" REQ_VARY */
4697      }        }
4698    
4699    /* If lookbehind, check that this branch matches a fixed-length string,      /* If lookbehind, check that this branch matches a fixed-length string, and
4700    and put the length into the OP_REVERSE item. Temporarily mark the end of      put the length into the OP_REVERSE item. Temporarily mark the end of the
4701    the branch with OP_END. */      branch with OP_END. */
4702    
4703    if (lookbehind)      if (lookbehind)
     {  
     int length;  
     *code = OP_END;  
     length = find_fixedlength(last_branch, options);  
     DPRINTF(("fixed length = %d\n", length));  
     if (length < 0)  
4704        {        {
4705        *errorcodeptr = (length == -2)? ERR36 : ERR25;        int fixed_length;
4706        *ptrptr = ptr;        *code = OP_END;
4707        return FALSE;        fixed_length = find_fixedlength(last_branch, options);
4708          DPRINTF(("fixed length = %d\n", fixed_length));
4709          if (fixed_length < 0)
4710            {
4711            *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
4712            *ptrptr = ptr;
4713            return FALSE;
4714            }
4715          PUT(reverse_count, 0, fixed_length);
4716        }        }
     PUT(reverse_count, 0, length);  
4717      }      }
4718    
4719    /* Reached end of expression, either ')' or end of pattern. Go back through    /* Reached end of expression, either ')' or end of pattern. Go back through
# Line 3706  for (;;) Line 4727  for (;;)
4727    
4728    if (*ptr != '|')    if (*ptr != '|')
4729      {      {
4730      int length = code - last_branch;      int branch_length = code - last_branch;
4731      do      do
4732        {        {
4733        int prev_length = GET(last_branch, 1);        int prev_length = GET(last_branch, 1);
4734        PUT(last_branch, 1, length);        PUT(last_branch, 1, branch_length);
4735        length = prev_length;        branch_length = prev_length;
4736        last_branch -= length;        last_branch -= branch_length;
4737        }        }
4738      while (length > 0);      while (branch_length > 0);
4739    
4740      /* Fill in the ket */      /* Fill in the ket */
4741    
# Line 3728  for (;;) Line 4749  for (;;)
4749        {        {
4750        *code++ = OP_OPT;        *code++ = OP_OPT;
4751        *code++ = oldims;        *code++ = oldims;
4752          length += 2;
4753        }        }
4754    
4755      /* Set values to pass back */      /* Set values to pass back */
# Line 3736  for (;;) Line 4758  for (;;)
4758      *ptrptr = ptr;      *ptrptr = ptr;
4759      *firstbyteptr = firstbyte;      *firstbyteptr = firstbyte;
4760      *reqbyteptr = reqbyte;      *reqbyteptr = reqbyte;
4761        if (lengthptr != NULL) *lengthptr += length;
4762      return TRUE;      return TRUE;
4763      }      }
4764    
# Line 3749  for (;;) Line 4772  for (;;)
4772    bc.current = last_branch = code;    bc.current = last_branch = code;
4773    code += 1 + LINK_SIZE;    code += 1 + LINK_SIZE;
4774    ptr++;    ptr++;
4775      length += 1 + LINK_SIZE;
4776    }    }
4777  /* Control never reaches here */  /* Control never reaches here */
4778  }  }
# Line 3799  is_anchored(register const uschar *code, Line 4823  is_anchored(register const uschar *code,
4823    unsigned int backref_map)    unsigned int backref_map)
4824  {  {
4825  do {  do {
4826     const uschar *scode =     const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
4827       first_significant_code(code + 1+LINK_SIZE, options, PCRE_MULTILINE, FALSE);       options, PCRE_MULTILINE, FALSE);
4828     register int op = *scode;     register int op = *scode;
4829    
4830       /* Non-capturing brackets */
4831    
4832       if (op == OP_BRA)
4833         {
4834         if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
4835         }
4836    
4837     /* Capturing brackets */     /* Capturing brackets */
4838    
4839     if (op > OP_BRA)     else if (op == OP_CBRA)
4840       {       {
4841       int new_map;       int n = GET2(scode, 1+LINK_SIZE);
4842       op -= OP_BRA;       int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
      if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);  
      new_map = bracket_map | ((op < 32)? (1 << op) : 1);  
4843       if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;       if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
4844       }       }
4845    
4846     /* Other brackets */     /* Other brackets */
4847    
4848     else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)     else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
4849       {       {
4850       if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;       if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
4851       }       }
# Line 3824  do { Line 4853  do {
4853     /* .* is not anchored unless DOTALL is set and it isn't in brackets that     /* .* is not anchored unless DOTALL is set and it isn't in brackets that
4854     are or may be referenced. */     are or may be referenced. */
4855    
4856     else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&     else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
4857                 op == OP_TYPEPOSSTAR) &&
4858              (*options & PCRE_DOTALL) != 0)              (*options & PCRE_DOTALL) != 0)
4859       {       {
4860       if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;       if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
# Line 3869  is_startline(const uschar *code, unsigne Line 4899  is_startline(const uschar *code, unsigne
4899    unsigned int backref_map)    unsigned int backref_map)
4900  {  {
4901  do {  do {
4902     const uschar *scode = first_significant_code(code + 1+LINK_SIZE, NULL, 0,     const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
4903       FALSE);       NULL, 0, FALSE);
4904     register int op = *scode;     register int op = *scode;
4905    
4906       /* Non-capturing brackets */
4907    
4908       if (op == OP_BRA)
4909         {
4910         if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
4911         }
4912    
4913     /* Capturing brackets */     /* Capturing brackets */
4914    
4915     if (op > OP_BRA)     else if (op == OP_CBRA)
4916       {       {
4917       int new_map;       int n = GET2(scode, 1+LINK_SIZE);
4918       op -= OP_BRA;       int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
      if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);  
      new_map = bracket_map | ((op < 32)? (1 << op) : 1);  
4919       if (!is_startline(scode, new_map, backref_map)) return FALSE;       if (!is_startline(scode, new_map, backref_map)) return FALSE;
4920       }       }
4921    
4922     /* Other brackets */     /* Other brackets */
4923    
4924     else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)     else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
4925       { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }       { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
4926    
4927     /* .* means "start at start or after \n" if it isn't in brackets that     /* .* means "start at start or after \n" if it isn't in brackets that
4928     may be referenced. */     may be referenced. */
4929    
4930     else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)     else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
4931       {       {
4932       if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;       if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
4933       }       }
# Line 3941  do { Line 4976  do {
4976       first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);       first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
4977     register int op = *scode;     register int op = *scode;
4978    
    if (op >= OP_BRA) op = OP_BRA;  
   
4979     switch(op)     switch(op)
4980       {       {
4981       default:       default:
4982       return -1;       return -1;
4983    
4984       case OP_BRA:       case OP_BRA:
4985         case OP_CBRA:
4986       case OP_ASSERT:       case OP_ASSERT:
4987       case OP_ONCE:       case OP_ONCE:
4988       case OP_COND:       case OP_COND:
# Line 3964  do { Line 4998  do {
4998       case OP_CHARNC:       case OP_CHARNC:
4999       case OP_PLUS:       case OP_PLUS:
5000       case OP_MINPLUS:       case OP_MINPLUS:
5001         case OP_POSPLUS:
5002       if (!inassert) return -1;       if (!inassert) return -1;
5003       if (c < 0)       if (c < 0)
5004         {         {
# Line 4012  return pcre_compile2(pattern, options, N Line 5047  return pcre_compile2(pattern, options, N
5047  }  }
5048    
5049    
   
5050  PCRE_DATA_SCOPE pcre *  PCRE_DATA_SCOPE pcre *
5051  pcre_compile2(const char *pattern, int options, int *errorcodeptr,  pcre_compile2(const char *pattern, int options, int *errorcodeptr,
5052    const char **errorptr, int *erroroffset, const unsigned char *tables)    const char **errorptr, int *erroroffset, const unsigned char *tables)
5053  {  {
5054  real_pcre *re;  real_pcre *re;
5055  int length = 1 + LINK_SIZE;      /* For initial BRA plus length */  int length = 1;  /* For final END opcode */
5056  int c, firstbyte, reqbyte, newline;  int firstbyte, reqbyte, newline;
 int bracount = 0;  
 int branch_extra = 0;  
 int branch_newextra;  
 int item_count = -1;  
 int name_count = 0;  
 int max_name_size = 0;  
 int lastitemlength = 0;  
5057  int errorcode = 0;  int errorcode = 0;
5058  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
5059  BOOL utf8;  BOOL utf8;
 BOOL class_utf8;  
5060  #endif  #endif
 BOOL inescq = FALSE;  
 BOOL capturing;  
 unsigned int brastackptr = 0;  
5061  size_t size;  size_t size;
5062  uschar *code;  uschar *code;
5063  const uschar *codestart;  const uschar *codestart;
5064  const uschar *ptr;  const uschar *ptr;
5065  compile_data compile_block;  compile_data compile_block;
5066  compile_data *cd = &compile_block;  compile_data *cd = &compile_block;
5067  int brastack[BRASTACK_SIZE];  
5068  uschar bralenstack[BRASTACK_SIZE];  /* This space is used for "compiling" into during the first phase, when we are
5069    computing the amount of memory that is needed. Compiled items are thrown away
5070    as soon as possible, so that a fairly large buffer should be sufficient for
5071    this purpose. The same space is used in the second phase for remembering where
5072    to fill in forward references to subpatterns. */
5073    
5074    uschar cworkspace[COMPILE_WORK_SIZE];
5075    
5076    
5077    /* Set this early so that early errors get offset 0. */
5078    
5079    ptr = (const uschar *)pattern;
5080    
5081  /* We can't pass back an error message if errorptr is NULL; I guess the best we  /* We can't pass back an error message if errorptr is NULL; I guess the best we
5082  can do is just return NULL, but we can set a code value if there is a code  can do is just return NULL, but we can set a code value if there is a code
# Line 4075  if (utf8 && (options & PCRE_NO_UTF8_CHEC Line 5109  if (utf8 && (options & PCRE_NO_UTF8_CHEC
5109       (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)       (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
5110    {    {
5111    errorcode = ERR44;    errorcode = ERR44;
5112    goto PCRE_EARLY_ERROR_RETURN;    goto PCRE_UTF8_ERROR_RETURN;
5113    }    }
5114  #else  #else
5115  if ((options & PCRE_UTF8) != 0)  if ((options & PCRE_UTF8) != 0)
# Line 4099  cd->fcc = tables + fcc_offset; Line 5133  cd->fcc = tables + fcc_offset;
5133  cd->cbits = tables + cbits_offset;  cd->cbits = tables + cbits_offset;
5134  cd->ctypes = tables + ctypes_offset;  cd->ctypes = tables + ctypes_offset;
5135    
5136  /* Handle different types of newline. The two bits give four cases. The current  /* Handle different types of newline. The three bits give seven cases. The
5137  code allows for one- or two-byte sequences. */  current code allows for fixed one- or two-byte sequences, plus "any". */
5138    
5139  switch (options & PCRE_NEWLINE_CRLF)  switch (options & (PCRE_NEWLINE_CRLF | PCRE_NEWLINE_ANY))
5140    {    {
5141    default:              newline = NEWLINE; break;   /* Compile-time default */    case 0: newline = NEWLINE; break;   /* Compile-time default */
5142    case PCRE_NEWLINE_CR: newline = '\r'; break;    case PCRE_NEWLINE_CR: newline = '\r'; break;
5143    case PCRE_NEWLINE_LF: newline = '\n'; break;    case PCRE_NEWLINE_LF: newline = '\n'; break;
5144    case PCRE_NEWLINE_CR+    case PCRE_NEWLINE_CR+
5145         PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;         PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
5146      case PCRE_NEWLINE_ANY: newline = -1; break;
5147      default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
5148    }    }
5149    
5150  if (newline > 255)  if (newline < 0)
5151    {    {
5152    cd->nllen = 2;    cd->nltype = NLTYPE_ANY;
   cd->nl[0] = (newline >> 8) & 255;  
   cd->nl[1] = newline & 255;  
5153    }    }
5154  else  else
5155    {    {
5156    cd->nllen = 1;    cd->nltype = NLTYPE_FIXED;
5157    cd->nl[0] = newline;    if (newline > 255)
5158        {
5159        cd->nllen = 2;
5160        cd->nl[0] = (newline >> 8) & 255;
5161        cd->nl[1] = newline & 255;
5162        }
5163      else
5164        {
5165        cd->nllen = 1;
5166        cd->nl[0] = newline;
5167        }
5168    }    }
5169    
5170  /* Maximum back reference and backref bitmap. This is updated for numeric  /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
5171  references during the first pass, but for named references during the actual  references to help in deciding whether (.*) can be treated as anchored or not.
5172  compile pass. The bitmap records up to 31 back references to help in deciding  */
 whether (.*) can be treated as anchored or not. */  
5173    
5174  cd->top_backref = 0;  cd->top_backref = 0;
5175  cd->backref_map = 0;  cd->backref_map = 0;
# Line 4136  cd->backref_map = 0; Line 5179  cd->backref_map = 0;
5179  DPRINTF(("------------------------------------------------------------------\n"));  DPRINTF(("------------------------------------------------------------------\n"));
5180  DPRINTF(("%s\n", pattern));  DPRINTF(("%s\n", pattern));
5181    
5182  /* The first thing to do is to make a pass over the pattern to compute the  /* Pretend to compile the pattern while actually just accumulating the length
5183  amount of store required to hold the compiled code. This does not have to be  of memory required. This behaviour is triggered by passing a non-NULL final
5184  perfect as long as errors are overestimates. At the same time we can detect any  argument to compile_regex(). We pass a block of workspace (cworkspace) for it
5185  flag settings right at the start, and extract them. Make an attempt to correct  to compile parts of the pattern into; the compiled code is discarded when it is
5186  for any counted white space if an "extended" flag setting appears late in the  no longer needed, so hopefully this workspace will never overflow, though there
5187  pattern. We can't be so clever for #-comments. */  is a test for its doing so. */
   
 ptr = (const uschar *)(pattern - 1);  
 while ((c = *(++ptr)) != 0)  
   {  
   int min, max;  
   int class_optcount;  
   int bracket_length;  
   int duplength;  
5188    
5189    /* If we are inside a \Q...\E sequence, all chars are literal */  cd->bracount = 0;
5190    cd->names_found = 0;
5191    cd->name_entry_size = 0;
5192    cd->name_table = NULL;
5193    cd->start_workspace = cworkspace;
5194    cd->start_code = cworkspace;
5195    cd->hwm = cworkspace;
5196    cd->start_pattern = (const uschar *)pattern;
5197    cd->end_pattern = (const uschar *)(pattern + strlen(pattern));
5198    cd->req_varyopt = 0;
5199    cd->nopartial = FALSE;
5200    cd->external_options = options;
5201    
5202    if (inescq)  /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
5203      {  don't need to look at the result of the function here. The initial options have
5204      if ((options & PCRE_AUTO_CALLOUT) != 0) length += 2 + 2*LINK_SIZE;  been put into the cd block so that they can be changed if an option setting is
5205      goto NORMAL_CHAR;  found within the regex right at the beginning. Bringing initial option settings
5206      }  outside can help speed up starting point checks. */
5207    
5208    /* Otherwise, first check for ignored whitespace and comments */  code = cworkspace;
5209    *code = OP_BRA;
5210    (void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,
5211      &code, &ptr, &errorcode, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, &length);
5212    if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
5213    
5214    if ((options & PCRE_EXTENDED) != 0)  DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
5215      {    cd->hwm - cworkspace));
     if ((cd->ctypes[c] & ctype_space) != 0) continue;  
     if (c == '#')  
       {  
       while (*(++ptr) != 0) if (IS_NEWLINE(ptr)) break;  
       if (*ptr != 0)  
         {  
         ptr += cd->nllen - 1;  
         continue;  
         }  
       break;    /* End loop at end of pattern */  
       }  
     }  
5216    
5217    item_count++;    /* Is zero for the first non-comment item */  if (length > MAX_PATTERN_SIZE)
5218      {
5219      errorcode = ERR20;
5220      goto PCRE_EARLY_ERROR_RETURN;
5221      }
5222    
5223    /* Allow space for auto callout before every item except quantifiers. */  /* Compute the size of data block needed and get it, either from malloc or
5224    externally provided function. Integer overflow should no longer be possible
5225    because nowadays we limit the maximum value of cd->names_found and
5226    cd->name_entry_size. */
5227    
5228    if ((options & PCRE_AUTO_CALLOUT) != 0 &&  size = length + sizeof(real_pcre) + cd->names_found * (cd->name_entry_size + 3);
5229         c != '*' && c != '+' && c != '?' &&  re = (real_pcre *)(pcre_malloc)(size);
        (c != '{' || !is_counted_repeat(ptr + 1)))  
     length += 2 + 2*LINK_SIZE;  
5230    
5231    switch(c)  if (re == NULL)
5232      {    {
5233      /* A backslashed item may be an escaped data character or it may be a    errorcode = ERR21;
5234      character type. */    goto PCRE_EARLY_ERROR_RETURN;
5235      }
5236    
5237      case '\\':  /* Put in the magic number, and save the sizes, initial options, and character
5238      c = check_escape(&ptr, &errorcode, bracount, options, FALSE);  table pointer. NULL is used for the default character tables. The nullpad field
5239      if (errorcode != 0) goto PCRE_ERROR_RETURN;  is at the end; it's there to help in the case when a regex compiled on a system
5240    with 4-byte pointers is run on another with 8-byte pointers. */
5241    
5242      lastitemlength = 1;     /* Default length of last item for repeats */  re->magic_number = MAGIC_NUMBER;
5243    re->size = size;
5244    re->options = cd->external_options;
5245    re->dummy1 = 0;
5246    re->first_byte = 0;
5247    re->req_byte = 0;
5248    re->name_table_offset = sizeof(real_pcre);
5249    re->name_entry_size = cd->name_entry_size;
5250    re->name_count = cd->names_found;
5251    re->ref_count = 0;
5252    re->tables = (tables == _pcre_default_tables)? NULL : tables;
5253    re->nullpad = NULL;
5254    
5255      if (c >= 0)             /* Data character */  /* The starting points of the name/number translation table and of the code are
5256        {  passed around in the compile data block. The start/end pattern and initial
5257        length += 2;          /* For a one-byte character */  options are already set from the pre-compile phase, as is the name_entry_size
5258    field. Reset the bracket count and the names_found field. Also reset the hwm
5259    field; this time it's used for remembering forward references to subpatterns.
5260    */
5261    
5262  #ifdef SUPPORT_UTF8  cd->bracount = 0;
5263        if (utf8 && c > 127)  cd->names_found = 0;
5264          {  cd->name_table = (uschar *)re + re->name_table_offset;
5265          int i;  codestart = cd->name_table + re->name_entry_size * re->name_count;
5266          for (i = 0; i < _pcre_utf8_table1_size; i++)  cd->start_code = codestart;
5267            if (c <= _pcre_utf8_table1[i]) break;  cd->hwm = cworkspace;
5268          length += i;  cd->req_varyopt = 0;
5269          lastitemlength += i;  cd->nopartial = FALSE;
         }  
 #endif  
5270    
5271        continue;  /* Set up a starting, non-extracting bracket, then compile the expression. On
5272        }  error, errorcode will be set non-zero, so we don't need to look at the result
5273    of the function here. */
5274    
5275      /* If \Q, enter "literal" mode */  ptr = (const uschar *)pattern;
5276    code = (uschar *)codestart;
5277    *code = OP_BRA;
5278    (void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,
5279      &errorcode, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);
5280    re->top_bracket = cd->bracount;
5281    re->top_backref = cd->top_backref;
5282    
5283      if (-c == ESC_Q)  if (cd->nopartial) re->options |= PCRE_NOPARTIAL;
       {  
       inescq = TRUE;  
       continue;  
       }  
5284    
5285      /* \X is supported only if Unicode property support is compiled */  /* If not reached end of pattern on success, there's an excess bracket. */
5286    
5287  #ifndef SUPPORT_UCP  if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
     if (-c == ESC_X)  
       {  
       errorcode = ERR45;  
       goto PCRE_ERROR_RETURN;  
       }  
 #endif  
5288    
5289      /* \P and \p are for Unicode properties, but only when the support has  /* Fill in the terminating state and check for disastrous overflow, but
5290      been compiled. Each item needs 3 bytes. */  if debugging, leave the test till after things are printed out. */
5291    
5292      else if (-c == ESC_P || -c == ESC_p)  *code++ = OP_END;
5293        {  
5294  #ifdef SUPPORT_UCP  #ifndef DEBUG
5295        BOOL negated;  if (code - codestart > length) errorcode = ERR23;
       BOOL pdata;  
       length += 3;  
       lastitemlength = 3;  
       if (get_ucp(&ptr, &negated, &pdata, &errorcode) < 0)  
         goto PCRE_ERROR_RETURN;  
       continue;  
 #else  
       errorcode = ERR45;  
       goto PCRE_ERROR_RETURN;  
5296  #endif  #endif
       }  
5297    
5298      /* Other escapes need one byte */  /* Fill in any forward references that are required. */
5299    
5300      length++;  while (errorcode == 0 && cd->hwm > cworkspace)
5301      {
5302      int offset, recno;
5303      const uschar *groupptr;
5304      cd->hwm -= LINK_SIZE;
5305      offset = GET(cd->hwm, 0);
5306      recno = GET(codestart, offset);
5307      groupptr = find_bracket(codestart, (re->options & PCRE_UTF8) != 0, recno);
5308     &nbs