/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 90 by nigel, Sat Feb 24 21:41:21 2007 UTC revision 91 by nigel, Sat Feb 24 21:41:34 2007 UTC
# Line 42  POSSIBILITY OF SUCH DAMAGE. Line 42  POSSIBILITY OF SUCH DAMAGE.
42  supporting internal functions that are not used by other modules. */  supporting internal functions that are not used by other modules. */
43    
44    
45    #define NLBLOCK cd            /* The block containing newline information */
46  #include "pcre_internal.h"  #include "pcre_internal.h"
47    
48    
# Line 190  static const char *error_texts[] = { Line 191  static const char *error_texts[] = {
191    "unrecognized character after (?<",    "unrecognized character after (?<",
192    /* 25 */    /* 25 */
193    "lookbehind assertion is not fixed length",    "lookbehind assertion is not fixed length",
194    "malformed number after (?(",    "malformed number or name after (?(",
195    "conditional group contains more than two branches",    "conditional group contains more than two branches",
196    "assertion expected after (?(",    "assertion expected after (?(",
197    "(?R or (?digits must be followed by )",    "(?R or (?digits must be followed by )",
# Line 210  static const char *error_texts[] = { Line 211  static const char *error_texts[] = {
211    "recursive call could loop indefinitely",    "recursive call could loop indefinitely",
212    "unrecognized character after (?P",    "unrecognized character after (?P",
213    "syntax error after (?P",    "syntax error after (?P",
214    "two named groups have the same name",    "two named subpatterns have the same name",
215    "invalid UTF-8 string",    "invalid UTF-8 string",
216    /* 45 */    /* 45 */
217    "support for \\P, \\p, and \\X has not been compiled",    "support for \\P, \\p, and \\X has not been compiled",
218    "malformed \\P or \\p sequence",    "malformed \\P or \\p sequence",
219    "unknown property name after \\P or \\p"    "unknown property name after \\P or \\p",
220      "subpattern name is too long (maximum 32 characters)",
221      "too many named subpatterns (maximum 10,000)",
222      /* 50 */
223      "repeated subpattern is too long",
224      "octal value is greater than \\377 (not in UTF-8 mode)"
225  };  };
226    
227    
# Line 460  else Line 466  else
466        }        }
467    
468      /* \0 always starts an octal number, but we may drop through to here with a      /* \0 always starts an octal number, but we may drop through to here with a
469      larger first octal digit. */      larger first octal digit. The original code used just to take the least
470        significant 8 bits of octal numbers (I think this is what early Perls used
471        to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
472        than 3 octal digits. */
473    
474      case '0':      case '0':
475      c -= '0';      c -= '0';
476      while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')      while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
477          c = c * 8 + *(++ptr) - '0';          c = c * 8 + *(++ptr) - '0';
478      c &= 255;     /* Take least significant 8 bits */      if (!utf8 && c > 255) *errorcodeptr = ERR51;
479      break;      break;
480    
481      /* \x is complicated. \x{ddd} is a character number which can be greater      /* \x is complicated. \x{ddd} is a character number which can be greater
# Line 763  return p; Line 772  return p;
772    
773    
774  /*************************************************  /*************************************************
775    *     Find forward referenced named subpattern   *
776    *************************************************/
777    
778    /* This function scans along a pattern looking for capturing subpatterns, and
779    counting them. If it finds a named pattern that matches the name it is given,
780    it returns its number. This is used for forward references to named
781    subpatterns. We know that if (?P< is encountered, the name will be terminated
782    by '>' because that is checked in the first pass.
783    
784    Arguments:
785      pointer      current position in the pattern
786      count        current count of capturing parens
787      name         name to seek
788      namelen      name length
789    
790    Returns:       the number of the named subpattern, or -1 if not found
791    */
792    
793    static int
794    find_named_parens(const uschar *ptr, int count, const uschar *name, int namelen)
795    {
796    const uschar *thisname;
797    for (; *ptr != 0; ptr++)
798      {
799      if (*ptr == '\\' && ptr[1] != 0) { ptr++; continue; }
800      if (*ptr != '(') continue;
801      if (ptr[1] != '?') { count++; continue; }
802      if (ptr[2] == '(') { ptr += 2; continue; }
803      if (ptr[2] != 'P' || ptr[3] != '<') continue;
804      count++;
805      ptr += 4;
806      thisname = ptr;
807      while (*ptr != '>') ptr++;
808      if (namelen == ptr - thisname && strncmp(name, thisname, namelen) == 0)
809        return count;
810      }
811    return -1;
812    }
813    
814    
815    
816    /*************************************************
817  *      Find first significant op code            *  *      Find first significant op code            *
818  *************************************************/  *************************************************/
819    
# Line 917  for (;;) Line 968  for (;;)
968    
969      case OP_CHAR:      case OP_CHAR:
970      case OP_CHARNC:      case OP_CHARNC:
971        case OP_NOT:
972      branchlength++;      branchlength++;
973      cc += 2;      cc += 2;
974  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
# Line 1031  Returns: pointer to the opcode for Line 1083  Returns: pointer to the opcode for
1083  static const uschar *  static const uschar *
1084  find_bracket(const uschar *code, BOOL utf8, int number)  find_bracket(const uschar *code, BOOL utf8, int number)
1085  {  {
 #ifndef SUPPORT_UTF8  
 utf8 = utf8;               /* Stop pedantic compilers complaining */  
 #endif  
   
1086  for (;;)  for (;;)
1087    {    {
1088    register int c = *code;    register int c = *code;
1089    if (c == OP_END) return NULL;    if (c == OP_END) return NULL;
1090    
1091      /* XCLASS is used for classes that cannot be represented just by a bit
1092      map. This includes negated single high-valued characters. The length in
1093      the table is zero; the actual length is stored in the compiled code. */
1094    
1095      if (c == OP_XCLASS) code += GET(code, 1);
1096    
1097      /* Handle bracketed group */
1098    
1099    else if (c > OP_BRA)    else if (c > OP_BRA)
1100      {      {
1101      int n = c - OP_BRA;      int n = c - OP_BRA;
# Line 1046  for (;;) Line 1103  for (;;)
1103      if (n == number) return (uschar *)code;      if (n == number) return (uschar *)code;
1104      code += _pcre_OP_lengths[OP_BRA];      code += _pcre_OP_lengths[OP_BRA];
1105      }      }
1106    
1107      /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes
1108      that are followed by a character may be followed by a multi-byte character.
1109      The length in the table is a minimum, so we have to scan along to skip the
1110      extra bytes. All opcodes are less than 128, so we can use relatively
1111      efficient code. */
1112    
1113    else    else
1114      {      {
1115      code += _pcre_OP_lengths[c];      code += _pcre_OP_lengths[c];
   
 #ifdef SUPPORT_UTF8  
   
     /* In UTF-8 mode, opcodes that are followed by a character may be followed  
     by a multi-byte character. The length in the table is a minimum, so we have  
     to scan along to skip the extra bytes. All opcodes are less than 128, so we  
     can use relatively efficient code. */  
   
1116      if (utf8) switch(c)      if (utf8) switch(c)
1117        {        {
1118        case OP_CHAR:        case OP_CHAR:
# Line 1072  for (;;) Line 1128  for (;;)
1128        case OP_MINQUERY:        case OP_MINQUERY:
1129        while ((*code & 0xc0) == 0x80) code++;        while ((*code & 0xc0) == 0x80) code++;
1130        break;        break;
   
       /* XCLASS is used for classes that cannot be represented just by a bit  
       map. This includes negated single high-valued characters. The length in  
       the table is zero; the actual length is stored in the compiled code. */  
   
       case OP_XCLASS:  
       code += GET(code, 1) + 1;  
       break;  
1131        }        }
 #endif  
1132      }      }
1133    }    }
1134  }  }
# Line 1105  Returns: pointer to the opcode for Line 1152  Returns: pointer to the opcode for
1152  static const uschar *  static const uschar *
1153  find_recurse(const uschar *code, BOOL utf8)  find_recurse(const uschar *code, BOOL utf8)
1154  {  {
 #ifndef SUPPORT_UTF8  
 utf8 = utf8;               /* Stop pedantic compilers complaining */  
 #endif  
   
1155  for (;;)  for (;;)
1156    {    {
1157    register int c = *code;    register int c = *code;
1158    if (c == OP_END) return NULL;    if (c == OP_END) return NULL;
1159    else if (c == OP_RECURSE) return code;    if (c == OP_RECURSE) return code;
1160    
1161      /* XCLASS is used for classes that cannot be represented just by a bit
1162      map. This includes negated single high-valued characters. The length in
1163      the table is zero; the actual length is stored in the compiled code. */
1164    
1165      if (c == OP_XCLASS) code += GET(code, 1);
1166    
1167      /* All bracketed groups have the same length. */
1168    
1169    else if (c > OP_BRA)    else if (c > OP_BRA)
1170      {      {
1171      code += _pcre_OP_lengths[OP_BRA];      code += _pcre_OP_lengths[OP_BRA];
1172      }      }
1173    
1174      /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes
1175      that are followed by a character may be followed by a multi-byte character.
1176      The length in the table is a minimum, so we have to scan along to skip the
1177      extra bytes. All opcodes are less than 128, so we can use relatively
1178      efficient code. */
1179    
1180    else    else
1181      {      {
1182      code += _pcre_OP_lengths[c];      code += _pcre_OP_lengths[c];
   
 #ifdef SUPPORT_UTF8  
   
     /* In UTF-8 mode, opcodes that are followed by a character may be followed  
     by a multi-byte character. The length in the table is a minimum, so we have  
     to scan along to skip the extra bytes. All opcodes are less than 128, so we  
     can use relatively efficient code. */  
   
1183      if (utf8) switch(c)      if (utf8) switch(c)
1184        {        {
1185        case OP_CHAR:        case OP_CHAR:
# Line 1144  for (;;) Line 1195  for (;;)
1195        case OP_MINQUERY:        case OP_MINQUERY:
1196        while ((*code & 0xc0) == 0x80) code++;        while ((*code & 0xc0) == 0x80) code++;
1197        break;        break;
   
       /* XCLASS is used for classes that cannot be represented just by a bit  
       map. This includes negated single high-valued characters. The length in  
       the table is zero; the actual length is stored in the compiled code. */  
   
       case OP_XCLASS:  
       code += GET(code, 1) + 1;  
       break;  
1198        }        }
 #endif  
1199      }      }
1200    }    }
1201  }  }
# Line 1569  int greedy_default, greedy_non_default; Line 1611  int greedy_default, greedy_non_default;
1611  int firstbyte, reqbyte;  int firstbyte, reqbyte;
1612  int zeroreqbyte, zerofirstbyte;  int zeroreqbyte, zerofirstbyte;
1613  int req_caseopt, reqvary, tempreqvary;  int req_caseopt, reqvary, tempreqvary;
 int condcount = 0;  
1614  int options = *optionsptr;  int options = *optionsptr;
1615  int after_manual_callout = 0;  int after_manual_callout = 0;
1616  register int c;  register int c;
# Line 1683  for (;; ptr++) Line 1724  for (;; ptr++)
1724      if ((cd->ctypes[c] & ctype_space) != 0) continue;      if ((cd->ctypes[c] & ctype_space) != 0) continue;
1725      if (c == '#')      if (c == '#')
1726        {        {
1727        /* The space before the ; is to avoid a warning on a silly compiler        while (*(++ptr) != 0) if (IS_NEWLINE(ptr)) break;
1728        on the Macintosh. */        if (*ptr != 0)
1729        while ((c = *(++ptr)) != 0 && c != NEWLINE) ;          {
1730        if (c != 0) continue;   /* Else fall through to handle end of string */          ptr += cd->nllen - 1;
1731            continue;
1732            }
1733          /* Else fall through to handle end of string */
1734          c = 0;
1735        }        }
1736      }      }
1737    
# Line 2851  for (;; ptr++) Line 2896  for (;; ptr++)
2896          case '(':          case '(':
2897          bravalue = OP_COND;       /* Conditional group */          bravalue = OP_COND;       /* Conditional group */
2898    
2899          /* Condition to test for recursion */          /* A condition can be a number, referring to a numbered group, a name,
2900            referring to a named group, 'R', referring to recursion, or an
2901            assertion. There are two unfortunate ambiguities, caused by history.
2902            (a) 'R' can be the recursive thing or the name 'R', and (b) a number
2903            could be a name that consists of digits. In both cases, we look for a
2904            name first; if not found, we try the other cases. If the first
2905            character after (?( is a word character, we know the rest up to ) will
2906            also be word characters because the syntax was checked in the first
2907            pass. */
2908    
2909          if (ptr[1] == 'R')          if ((cd->ctypes[ptr[1]] & ctype_word) != 0)
2910            {            {
2911            code[1+LINK_SIZE] = OP_CREF;            int i, namelen;
2912            PUT2(code, 2+LINK_SIZE, CREF_RECURSE);            int condref = 0;
2913              const uschar *name;
2914              uschar *slot = cd->name_table;
2915    
2916              /* This is needed for all successful cases. */
2917    
2918            skipbytes = 3;            skipbytes = 3;
           ptr += 3;  
           }  
2919    
2920          /* Condition to test for a numbered subpattern match. We know that            /* Read the name, but also get it as a number if it's all digits */
         if a digit follows ( then there will just be digits until ) because  
         the syntax was checked in the first pass. */  
2921    
2922          else if ((digitab[ptr[1]] && ctype_digit) != 0)            name = ++ptr;
2923            {            while (*ptr != ')')
           int condref;                 /* Don't amalgamate; some compilers */  
           condref = *(++ptr) - '0';    /* grumble at autoincrement in declaration */  
           while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';  
           if (condref == 0)  
2924              {              {
2925              *errorcodeptr = ERR35;              if (condref >= 0)
2926              goto FAILED;                condref = ((digitab[*ptr] & ctype_digit) != 0)?
2927                    condref * 10 + *ptr - '0' : -1;
2928                ptr++;
2929              }              }
2930              namelen = ptr - name;
2931            ptr++;            ptr++;
2932            code[1+LINK_SIZE] = OP_CREF;  
2933            PUT2(code, 2+LINK_SIZE, condref);            for (i = 0; i < cd->names_found; i++)
2934            skipbytes = 3;              {
2935                if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
2936                slot += cd->name_entry_size;
2937                }
2938    
2939              /* Found a previous named subpattern */
2940    
2941              if (i < cd->names_found)
2942                {
2943                condref = GET2(slot, 0);
2944                code[1+LINK_SIZE] = OP_CREF;
2945                PUT2(code, 2+LINK_SIZE, condref);
2946                }
2947    
2948              /* Search the pattern for a forward reference */
2949    
2950              else if ((i = find_named_parens(ptr, *brackets, name, namelen)) > 0)
2951                {
2952                code[1+LINK_SIZE] = OP_CREF;
2953                PUT2(code, 2+LINK_SIZE, i);
2954                }
2955    
2956              /* Check for 'R' for recursion */
2957    
2958              else if (namelen == 1 && *name == 'R')
2959                {
2960                code[1+LINK_SIZE] = OP_CREF;
2961                PUT2(code, 2+LINK_SIZE, CREF_RECURSE);
2962                }
2963    
2964              /* Check for a subpattern number */
2965    
2966              else if (condref > 0)
2967                {
2968                code[1+LINK_SIZE] = OP_CREF;
2969                PUT2(code, 2+LINK_SIZE, condref);
2970                }
2971    
2972              /* Either an unidentified subpattern, or a reference to (?(0) */
2973    
2974              else
2975                {
2976                *errorcodeptr = (condref == 0)? ERR35: ERR15;
2977                goto FAILED;
2978                }
2979            }            }
2980    
2981          /* For conditions that are assertions, we just fall through, having          /* For conditions that are assertions, we just fall through, having
2982          set bravalue above. */          set bravalue above. */
2983    
2984          break;          break;
2985    
2986          case '=':                 /* Positive lookahead */          case '=':                 /* Positive lookahead */
# Line 2953  for (;; ptr++) Line 3052  for (;; ptr++)
3052                {                {
3053                if (slot[2+namelen] == 0)                if (slot[2+namelen] == 0)
3054                  {                  {
3055                  *errorcodeptr = ERR43;                  if ((options & PCRE_DUPNAMES) == 0)
3056                  goto FAILED;                    {
3057                      *errorcodeptr = ERR43;
3058                      goto FAILED;
3059                      }
3060                  }                  }
3061                crc = -1;             /* Current name is substring */                else crc = -1;      /* Current name is substring */
3062                }                }
3063              if (crc < 0)              if (crc < 0)
3064                {                {
# Line 2989  for (;; ptr++) Line 3091  for (;; ptr++)
3091              if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;              if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
3092              slot += cd->name_entry_size;              slot += cd->name_entry_size;
3093              }              }
3094            if (i >= cd->names_found)  
3095              if (i < cd->names_found)         /* Back reference */
3096                {
3097                recno = GET2(slot, 0);
3098                }
3099              else if ((recno =                /* Forward back reference */
3100                        find_named_parens(ptr, *brackets, name, namelen)) <= 0)
3101              {              {
3102              *errorcodeptr = ERR15;              *errorcodeptr = ERR15;
3103              goto FAILED;              goto FAILED;
3104              }              }
3105    
           recno = GET2(slot, 0);  
   
3106            if (type == '>') goto HANDLE_RECURSION;  /* A few lines below */            if (type == '>') goto HANDLE_RECURSION;  /* A few lines below */
3107    
3108            /* Back reference */            /* Back reference */
# Line 3036  for (;; ptr++) Line 3142  for (;; ptr++)
3142            regex in case it doesn't exist. */            regex in case it doesn't exist. */
3143    
3144            *code = OP_END;            *code = OP_END;
3145            called = (recno == 0)?            called = (recno == 0)? cd->start_code :
3146              cd->start_code : find_bracket(cd->start_code, utf8, recno);              find_bracket(cd->start_code, utf8, recno);
   
3147            if (called == NULL)            if (called == NULL)
3148              {              {
3149              *errorcodeptr = ERR15;              *errorcodeptr = ERR15;
# Line 3085  for (;; ptr++) Line 3190  for (;; ptr++)
3190              case '-': optset = &unset; break;              case '-': optset = &unset; break;
3191    
3192              case 'i': *optset |= PCRE_CASELESS; break;              case 'i': *optset |= PCRE_CASELESS; break;
3193                case 'J': *optset |= PCRE_DUPNAMES; break;
3194              case 'm': *optset |= PCRE_MULTILINE; break;              case 'm': *optset |= PCRE_MULTILINE; break;
3195              case 's': *optset |= PCRE_DOTALL; break;              case 's': *optset |= PCRE_DOTALL; break;
3196              case 'x': *optset |= PCRE_EXTENDED; break;              case 'x': *optset |= PCRE_EXTENDED; break;
# Line 3201  for (;; ptr++) Line 3307  for (;; ptr++)
3307      else if (bravalue == OP_COND)      else if (bravalue == OP_COND)
3308        {        {
3309        uschar *tc = code;        uschar *tc = code;
3310        condcount = 0;        int condcount = 0;
3311    
3312        do {        do {
3313           condcount++;           condcount++;
# Line 3906  return pcre_compile2(pattern, options, N Line 4012  return pcre_compile2(pattern, options, N
4012  }  }
4013    
4014    
4015    
4016  PCRE_DATA_SCOPE pcre *  PCRE_DATA_SCOPE pcre *
4017  pcre_compile2(const char *pattern, int options, int *errorcodeptr,  pcre_compile2(const char *pattern, int options, int *errorcodeptr,
4018    const char **errorptr, int *erroroffset, const unsigned char *tables)    const char **errorptr, int *erroroffset, const unsigned char *tables)
4019  {  {
4020  real_pcre *re;  real_pcre *re;
4021  int length = 1 + LINK_SIZE;      /* For initial BRA plus length */  int length = 1 + LINK_SIZE;      /* For initial BRA plus length */
4022  int c, firstbyte, reqbyte;  int c, firstbyte, reqbyte, newline;
4023  int bracount = 0;  int bracount = 0;
4024  int branch_extra = 0;  int branch_extra = 0;
4025  int branch_newextra;  int branch_newextra;
# Line 3933  uschar *code; Line 4040  uschar *code;
4040  const uschar *codestart;  const uschar *codestart;
4041  const uschar *ptr;  const uschar *ptr;
4042  compile_data compile_block;  compile_data compile_block;
4043    compile_data *cd = &compile_block;
4044  int brastack[BRASTACK_SIZE];  int brastack[BRASTACK_SIZE];
4045  uschar bralenstack[BRASTACK_SIZE];  uschar bralenstack[BRASTACK_SIZE];
4046    
# Line 3986  if ((options & ~PUBLIC_OPTIONS) != 0) Line 4094  if ((options & ~PUBLIC_OPTIONS) != 0)
4094  /* Set up pointers to the individual character tables */  /* Set up pointers to the individual character tables */
4095    
4096  if (tables == NULL) tables = _pcre_default_tables;  if (tables == NULL) tables = _pcre_default_tables;
4097  compile_block.lcc = tables + lcc_offset;  cd->lcc = tables + lcc_offset;
4098  compile_block.fcc = tables + fcc_offset;  cd->fcc = tables + fcc_offset;
4099  compile_block.cbits = tables + cbits_offset;  cd->cbits = tables + cbits_offset;
4100  compile_block.ctypes = tables + ctypes_offset;  cd->ctypes = tables + ctypes_offset;
4101    
4102    /* Handle different types of newline. The two bits give four cases. The current
4103    code allows for one- or two-byte sequences. */
4104    
4105    switch (options & PCRE_NEWLINE_CRLF)
4106      {
4107      default:              newline = NEWLINE; break;   /* Compile-time default */
4108      case PCRE_NEWLINE_CR: newline = '\r'; break;
4109      case PCRE_NEWLINE_LF: newline = '\n'; break;
4110      case PCRE_NEWLINE_CR+
4111           PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
4112      }
4113    
4114    if (newline > 255)
4115      {
4116      cd->nllen = 2;
4117      cd->nl[0] = (newline >> 8) & 255;
4118      cd->nl[1] = newline & 255;
4119      }
4120    else
4121      {
4122      cd->nllen = 1;
4123      cd->nl[0] = newline;
4124      }
4125    
4126  /* Maximum back reference and backref bitmap. This is updated for numeric  /* Maximum back reference and backref bitmap. This is updated for numeric
4127  references during the first pass, but for named references during the actual  references during the first pass, but for named references during the actual
4128  compile pass. The bitmap records up to 31 back references to help in deciding  compile pass. The bitmap records up to 31 back references to help in deciding
4129  whether (.*) can be treated as anchored or not. */  whether (.*) can be treated as anchored or not. */
4130    
4131  compile_block.top_backref = 0;  cd->top_backref = 0;
4132  compile_block.backref_map = 0;  cd->backref_map = 0;
4133    
4134  /* Reflect pattern for debugging output */  /* Reflect pattern for debugging output */
4135    
# Line 4031  while ((c = *(++ptr)) != 0) Line 4163  while ((c = *(++ptr)) != 0)
4163    
4164    if ((options & PCRE_EXTENDED) != 0)    if ((options & PCRE_EXTENDED) != 0)
4165      {      {
4166      if ((compile_block.ctypes[c] & ctype_space) != 0) continue;      if ((cd->ctypes[c] & ctype_space) != 0) continue;
4167      if (c == '#')      if (c == '#')
4168        {        {
4169        /* The space before the ; is to avoid a warning on a silly compiler        while (*(++ptr) != 0) if (IS_NEWLINE(ptr)) break;
4170        on the Macintosh. */        if (*ptr != 0)
4171        while ((c = *(++ptr)) != 0 && c != NEWLINE) ;          {
4172        if (c == 0) break;          ptr += cd->nllen - 1;
4173        continue;          continue;
4174            }
4175          break;    /* End loop at end of pattern */
4176        }        }
4177      }      }
4178    
# Line 4128  while ((c = *(++ptr)) != 0) Line 4262  while ((c = *(++ptr)) != 0)
4262      if (c <= -ESC_REF)      if (c <= -ESC_REF)
4263        {        {
4264        int refnum = -c - ESC_REF;        int refnum = -c - ESC_REF;
4265        compile_block.backref_map |= (refnum < 32)? (1 << refnum) : 1;        cd->backref_map |= (refnum < 32)? (1 << refnum) : 1;
4266        if (refnum > compile_block.top_backref)        if (refnum > cd->top_backref)
4267          compile_block.top_backref = refnum;          cd->top_backref = refnum;
4268        length += 2;   /* For single back reference */        length += 2;   /* For single back reference */
4269        if (ptr[1] == '{' && is_counted_repeat(ptr+2))        if (ptr[1] == '{' && is_counted_repeat(ptr+2))
4270          {          {
# Line 4284  while ((c = *(++ptr)) != 0) Line 4418  while ((c = *(++ptr)) != 0)
4418        /* Check the syntax for POSIX stuff. The bits we actually handle are        /* Check the syntax for POSIX stuff. The bits we actually handle are
4419        checked during the real compile phase. */        checked during the real compile phase. */
4420    
4421        else if (*ptr == '[' && check_posix_syntax(ptr, &ptr, &compile_block))        else if (*ptr == '[' &&
4422                    (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
4423                    check_posix_syntax(ptr, &ptr, cd))
4424          {          {
4425          ptr++;          ptr++;
4426          class_optcount = 10;    /* Make sure > 1 */          class_optcount = 10;    /* Make sure > 1 */
# Line 4517  while ((c = *(++ptr)) != 0) Line 4653  while ((c = *(++ptr)) != 0)
4653          ptr += 2;          ptr += 2;
4654          break;          break;
4655    
4656            /* Named subpatterns are an extension copied from Python */
4657    
4658            case 'P':
4659            ptr += 3;
4660    
4661            /* Handle the definition of a named subpattern */
4662    
4663            if (*ptr == '<')
4664              {
4665              const uschar *p;    /* Don't amalgamate; some compilers */
4666              p = ++ptr;          /* grumble at autoincrement in declaration */
4667              while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4668              if (*ptr != '>')
4669                {
4670                errorcode = ERR42;
4671                goto PCRE_ERROR_RETURN;
4672                }
4673              name_count++;
4674              if (name_count > MAX_NAME_COUNT)
4675                {
4676                errorcode = ERR49;
4677                goto PCRE_ERROR_RETURN;
4678                }
4679              if (ptr - p > max_name_size)
4680                {
4681                max_name_size = (ptr - p);
4682                if (max_name_size > MAX_NAME_SIZE)
4683                  {
4684                  errorcode = ERR48;
4685                  goto PCRE_ERROR_RETURN;
4686                  }
4687                }
4688              capturing = TRUE;   /* Named parentheses are always capturing */
4689              break;              /* Go handle capturing parentheses */
4690              }
4691    
4692            /* Handle back references and recursive calls to named subpatterns */
4693    
4694            if (*ptr == '=' || *ptr == '>')
4695              {
4696              length += 3 + 3*LINK_SIZE;  /* Allow for the automatic "once" */
4697              while ((cd->ctypes[*(++ptr)] & ctype_word) != 0);
4698              if (*ptr != ')')
4699                {
4700                errorcode = ERR42;
4701                goto PCRE_ERROR_RETURN;
4702                }
4703              goto RECURSE_CHECK_QUANTIFIED;
4704              }
4705    
4706            /* Unknown character after (?P */
4707    
4708            errorcode = ERR41;
4709            goto PCRE_ERROR_RETURN;
4710    
4711          /* (?R) specifies a recursive call to the regex, which is an extension          /* (?R) specifies a recursive call to the regex, which is an extension
4712          to provide the facility which can be obtained by (?p{perl-code}) in          to provide the facility which can be obtained by (?p{perl-code}) in
4713          Perl 5.6. In Perl 5.8 this has become (??{perl-code}).          Perl 5.6. In Perl 5.8 this has become (??{perl-code}).
# Line 4542  while ((c = *(++ptr)) != 0) Line 4733  while ((c = *(++ptr)) != 0)
4733    
4734          /* If this item is quantified, it will get wrapped inside brackets so          /* If this item is quantified, it will get wrapped inside brackets so
4735          as to use the code for quantified brackets. We jump down and use the          as to use the code for quantified brackets. We jump down and use the
4736          code that handles this for real brackets. */          code that handles this for real brackets. Come here from code for
4737            named recursions/subroutines. */
4738    
4739            RECURSE_CHECK_QUANTIFIED:
4740          if (ptr[1] == '+' || ptr[1] == '*' || ptr[1] == '?' || ptr[1] == '{')          if (ptr[1] == '+' || ptr[1] == '*' || ptr[1] == '?' || ptr[1] == '{')
4741            {            {
4742            length += 2 + 2 * LINK_SIZE;       /* to make bracketed */            length += 2 + 2 * LINK_SIZE;       /* to make bracketed */
# Line 4567  while ((c = *(++ptr)) != 0) Line 4760  while ((c = *(++ptr)) != 0)
4760          length += 2 + 2*LINK_SIZE;          length += 2 + 2*LINK_SIZE;
4761          continue;          continue;
4762    
         /* Named subpatterns are an extension copied from Python */  
   
         case 'P':  
         ptr += 3;  
   
         /* Handle the definition of a named subpattern */  
   
         if (*ptr == '<')  
           {  
           const uschar *p;    /* Don't amalgamate; some compilers */  
           p = ++ptr;          /* grumble at autoincrement in declaration */  
           while ((compile_block.ctypes[*ptr] & ctype_word) != 0) ptr++;  
           if (*ptr != '>')  
             {  
             errorcode = ERR42;  
             goto PCRE_ERROR_RETURN;  
             }  
           name_count++;  
           if (ptr - p > max_name_size) max_name_size = (ptr - p);  
           capturing = TRUE;   /* Named parentheses are always capturing */  
           break;  
           }  
   
         /* Handle back references and recursive calls to named subpatterns */  
   
         if (*ptr == '=' || *ptr == '>')  
           {  
           length += 2 + 2*LINK_SIZE;  /* Allow for the automatic "once" */  
           while ((compile_block.ctypes[*(++ptr)] & ctype_word) != 0);  
           if (*ptr != ')')  
             {  
             errorcode = ERR42;  
             goto PCRE_ERROR_RETURN;  
             }  
           break;  
           }  
   
         /* Unknown character after (?P */  
   
         errorcode = ERR41;  
         goto PCRE_ERROR_RETURN;  
   
4763          /* Lookbehinds are in Perl from version 5.005 */          /* Lookbehinds are in Perl from version 5.005 */
4764    
4765          case '<':          case '<':
# Line 4624  while ((c = *(++ptr)) != 0) Line 4775  while ((c = *(++ptr)) != 0)
4775    
4776          /* Conditionals are in Perl from version 5.005. The bracket must either          /* Conditionals are in Perl from version 5.005. The bracket must either
4777          be followed by a number (for bracket reference) or by an assertion          be followed by a number (for bracket reference) or by an assertion
4778          group, or (a PCRE extension) by 'R' for a recursion test. */          group. PCRE extends this by allowing a name to reference a named group;
4779            unfortunately, previously 'R' was implemented for a recursion test.
4780            When this is compiled, we look for the named group 'R' first. At this
4781            point we just do a basic syntax check. */
4782    
4783          case '(':          case '(':
4784          if (ptr[3] == 'R' && ptr[4] == ')')          if ((cd->ctypes[ptr[3]] & ctype_word) != 0)
           {  
           ptr += 4;  
           length += 3;  
           }  
         else if ((digitab[ptr[3]] & ctype_digit) != 0)  
4785            {            {
4786            ptr += 4;            ptr += 4;
4787            length += 3;            length += 3;
4788            while ((digitab[*ptr] & ctype_digit) != 0) ptr++;            while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4789            if (*ptr != ')')            if (*ptr != ')')
4790              {              {
4791              errorcode = ERR26;              errorcode = ERR26;
# Line 4675  while ((c = *(++ptr)) != 0) Line 4824  while ((c = *(++ptr)) != 0)
4824              *optset |= PCRE_CASELESS;              *optset |= PCRE_CASELESS;
4825              continue;              continue;
4826    
4827                case 'J':
4828                *optset |= PCRE_DUPNAMES;
4829                options |= PCRE_JCHANGED;   /* Record that it changed */
4830                continue;
4831    
4832              case 'm':              case 'm':
4833              *optset |= PCRE_MULTILINE;              *optset |= PCRE_MULTILINE;
4834              continue;              continue;
# Line 4740  while ((c = *(++ptr)) != 0) Line 4894  while ((c = *(++ptr)) != 0)
4894              will lead to an over-estimate on the length, but this shouldn't              will lead to an over-estimate on the length, but this shouldn't
4895              matter very much. We also have to allow for resetting options at              matter very much. We also have to allow for resetting options at
4896              the start of any alternations, which we do by setting              the start of any alternations, which we do by setting
4897              branch_newextra to 2. Finally, we record whether the case-dependent              branch_newextra to 2. */
             flag ever changes within the regex. This is used by the "required  
             character" code. */  
4898    
4899              case ':':              case ':':
4900              if (((set|unset) & PCRE_IMS) != 0)              if (((set|unset) & PCRE_IMS) != 0)
4901                {                {
4902                length += 4;                length += 4;
4903                branch_newextra = 2;                branch_newextra = 2;
               if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED;  
4904                }                }
4905              goto END_OPTIONS;              goto END_OPTIONS;
4906    
# Line 4829  while ((c = *(++ptr)) != 0) Line 4980  while ((c = *(++ptr)) != 0)
4980        {        {
4981        duplength = length - brastack[--brastackptr];        duplength = length - brastack[--brastackptr];
4982        branch_extra = bralenstack[brastackptr];        branch_extra = bralenstack[brastackptr];
4983          /* This is a paranoid check to stop integer overflow later on */
4984          if (duplength > MAX_DUPLENGTH)
4985            {
4986            errorcode = ERR50;
4987            goto PCRE_ERROR_RETURN;
4988            }
4989        }        }
4990      else duplength = 0;      else duplength = 0;
4991    
# Line 4933  if (length > MAX_PATTERN_SIZE) Line 5090  if (length > MAX_PATTERN_SIZE)
5090    }    }
5091    
5092  /* Compute the size of data block needed and get it, either from malloc or  /* Compute the size of data block needed and get it, either from malloc or
5093  externally provided function. */  externally provided function. Integer overflow should no longer be possible
5094    because nowadays we limit the maximum value of name_count and max_name_size. */
5095    
5096  size = length + sizeof(real_pcre) + name_count * (max_name_size + 3);  size = length + sizeof(real_pcre) + name_count * (max_name_size + 3);
5097  re = (real_pcre *)(pcre_malloc)(size);  re = (real_pcre *)(pcre_malloc)(size);
# Line 4963  re->nullpad = NULL; Line 5121  re->nullpad = NULL;
5121  /* The starting points of the name/number translation table and of the code are  /* The starting points of the name/number translation table and of the code are
5122  passed around in the compile data block. */  passed around in the compile data block. */
5123    
5124  compile_block.names_found = 0;  cd->names_found = 0;
5125  compile_block.name_entry_size = max_name_size + 3;  cd->name_entry_size = max_name_size + 3;
5126  compile_block.name_table = (uschar *)re + re->name_table_offset;  cd->name_table = (uschar *)re + re->name_table_offset;
5127  codestart = compile_block.name_table + re->name_entry_size * re->name_count;  codestart = cd->name_table + re->name_entry_size * re->name_count;
5128  compile_block.start_code = codestart;  cd->start_code = codestart;
5129  compile_block.start_pattern = (const uschar *)pattern;  cd->start_pattern = (const uschar *)pattern;
5130  compile_block.req_varyopt = 0;  cd->req_varyopt = 0;
5131  compile_block.nopartial = FALSE;  cd->nopartial = FALSE;
5132    
5133  /* Set up a starting, non-extracting bracket, then compile the expression. On  /* Set up a starting, non-extracting bracket, then compile the expression. On
5134  error, errorcode will be set non-zero, so we don't need to look at the result  error, errorcode will be set non-zero, so we don't need to look at the result
# Line 4981  code = (uschar *)codestart; Line 5139  code = (uschar *)codestart;
5139  *code = OP_BRA;  *code = OP_BRA;
5140  bracount = 0;  bracount = 0;
5141  (void)compile_regex(options, options & PCRE_IMS, &bracount, &code, &ptr,  (void)compile_regex(options, options & PCRE_IMS, &bracount, &code, &ptr,
5142    &errorcode, FALSE, 0, &firstbyte, &reqbyte, NULL, &compile_block);    &errorcode, FALSE, 0, &firstbyte, &reqbyte, NULL, cd);
5143  re->top_bracket = bracount;  re->top_bracket = bracount;
5144  re->top_backref = compile_block.top_backref;  re->top_backref = cd->top_backref;
5145    
5146  if (compile_block.nopartial) re->options |= PCRE_NOPARTIAL;  if (cd->nopartial) re->options |= PCRE_NOPARTIAL;
5147    
5148  /* If not reached end of pattern on success, there's an excess bracket. */  /* If not reached end of pattern on success, there's an excess bracket. */
5149    
# Line 5031  start with ^. and also when all branches Line 5189  start with ^. and also when all branches
5189  if ((options & PCRE_ANCHORED) == 0)  if ((options & PCRE_ANCHORED) == 0)
5190    {    {
5191    int temp_options = options;    int temp_options = options;
5192    if (is_anchored(codestart, &temp_options, 0, compile_block.backref_map))    if (is_anchored(codestart, &temp_options, 0, cd->backref_map))
5193      re->options |= PCRE_ANCHORED;      re->options |= PCRE_ANCHORED;
5194    else    else
5195      {      {
# Line 5041  if ((options & PCRE_ANCHORED) == 0) Line 5199  if ((options & PCRE_ANCHORED) == 0)
5199        {        {
5200        int ch = firstbyte & 255;        int ch = firstbyte & 255;
5201        re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&        re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
5202           compile_block.fcc[ch] == ch)? ch : firstbyte;           cd->fcc[ch] == ch)? ch : firstbyte;
5203        re->options |= PCRE_FIRSTSET;        re->options |= PCRE_FIRSTSET;
5204        }        }
5205      else if (is_startline(codestart, 0, compile_block.backref_map))      else if (is_startline(codestart, 0, cd->backref_map))
5206        re->options |= PCRE_STARTLINE;        re->options |= PCRE_STARTLINE;
5207      }      }
5208    }    }
# Line 5058  if (reqbyte >= 0 && Line 5216  if (reqbyte >= 0 &&
5216    {    {
5217    int ch = reqbyte & 255;    int ch = reqbyte & 255;
5218    re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&    re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
5219      compile_block.fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;      cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
5220    re->options |= PCRE_REQCHSET;    re->options |= PCRE_REQCHSET;
5221    }    }
5222    
# Line 5072  printf("Length = %d top_bracket = %d top Line 5230  printf("Length = %d top_bracket = %d top
5230    
5231  if (re->options != 0)  if (re->options != 0)
5232    {    {
5233    printf("%s%s%s%s%s%s%s%s%s%s\n",    printf("%s%s%s%s%s%s%s%s%s\n",
5234      ((re->options & PCRE_NOPARTIAL) != 0)? "nopartial " : "",      ((re->options & PCRE_NOPARTIAL) != 0)? "nopartial " : "",
5235      ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",      ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
5236      ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",      ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
     ((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "",  
5237      ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",      ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
5238      ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",      ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
5239      ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",      ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",

Legend:
Removed from v.90  
changed lines
  Added in v.91

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12