/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory | Revision Log | View Patch

revision 107 by ph10, Wed Mar 7 11:02:28 2007 UTC | revision 171 by ph10, Mon Jun 4 14:28:58 2007 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2006 University of Cambridge             Copyright (c) 1997-2007 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 87  static const short int escapes[] = { Line 87  static const short int escapes[] = {
87       0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */       0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */
88       0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */       0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */
89     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */
90       0,      0,      0,      0,      0,      0,      0,      0,   /* H - O */       0,      0,      0, -ESC_K,      0,      0,      0,      0,   /* H - O */
91  -ESC_P, -ESC_Q, -ESC_R, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */  -ESC_P, -ESC_Q, -ESC_R, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */
92  -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */  -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */
93     '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */     '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */
# Line 208  static const char *error_texts[] = { Line 208  static const char *error_texts[] = {
208    "malformed number or name after (?(",    "malformed number or name after (?(",
209    "conditional group contains more than two branches",    "conditional group contains more than two branches",
210    "assertion expected after (?(",    "assertion expected after (?(",
211    "(?R or (?digits must be followed by )",    "(?R or (?[+-]digits must be followed by )",
212    /* 30 */    /* 30 */
213    "unknown POSIX class name",    "unknown POSIX class name",
214    "POSIX collating elements are not supported",    "POSIX collating elements are not supported",
# Line 242  static const char *error_texts[] = { Line 242  static const char *error_texts[] = {
242    /* 55 */    /* 55 */
243    "repeating a DEFINE group is not allowed",    "repeating a DEFINE group is not allowed",
244    "inconsistent NEWLINE options",    "inconsistent NEWLINE options",
245    "\\g is not followed by an (optionally braced) non-zero number"    "\\g is not followed by a braced name or an optionally braced non-zero number",
246      "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number"
247  };  };
248    
249    
# Line 452  else Line 453  else
453    
454      /* \g must be followed by a number, either plain or braced. If positive, it      /* \g must be followed by a number, either plain or braced. If positive, it
455      is an absolute backreference. If negative, it is a relative backreference.      is an absolute backreference. If negative, it is a relative backreference.
456      This is a Perl 5.10 feature. */      This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a
457        reference to a named group. This is part of Perl's movement towards a
458        unified syntax for back references. As this is synonymous with \k{name}, we
459        fudge it up by pretending it really was \k. */
460    
461      case 'g':      case 'g':
462      if (ptr[1] == '{')      if (ptr[1] == '{')
463        {        {
464          const uschar *p;
465          for (p = ptr+2; *p != 0 && *p != '}'; p++)
466            if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
467          if (*p != 0 && *p != '}')
468            {
469            c = -ESC_k;
470            break;
471            }
472        braced = TRUE;        braced = TRUE;
473        ptr++;        ptr++;
474        }        }
# Line 1267  for (;;) Line 1279  for (;;)
1279        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1280        break;        break;
1281        }        }
1282  #endif  #endif
1283      }      }
1284    }    }
1285  }  }
# Line 1332  for (;;) Line 1344  for (;;)
1344        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1345        break;        break;
1346        }        }
1347  #endif  #endif
1348      }      }
1349    }    }
1350  }  }
# Line 1369  for (code = first_significant_code(code Line 1381  for (code = first_significant_code(code
1381    const uschar *ccode;    const uschar *ccode;
1382    
1383    c = *code;    c = *code;
1384    
1385      /* Groups with zero repeats can of course be empty; skip them. */
1386    
1387      if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1388        {
1389        do code += GET(code, 1); while (*code == OP_ALT);
1390        c = *code;
1391        continue;
1392        }
1393    
1394      /* For other groups, scan the branches. */
1395    
1396    if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE)    if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE)
1397      {      {
1398      BOOL empty_branch;      BOOL empty_branch;
# Line 1386  for (code = first_significant_code(code Line 1409  for (code = first_significant_code(code
1409        }        }
1410      while (*code == OP_ALT);      while (*code == OP_ALT);
1411      if (!empty_branch) return FALSE;   /* All branches are non-empty */      if (!empty_branch) return FALSE;   /* All branches are non-empty */
1412        c = *code;
     /* Move past the KET and fudge things so that the increment in the "for"  
     above has no effect. */  
   
     c = OP_END;  
     code += 1 + LINK_SIZE - _pcre_OP_lengths[c];  
1413      continue;      continue;
1414      }      }
1415    
# Line 2095  for (;; ptr++) Line 2113  for (;; ptr++)
2113    int class_lastchar;    int class_lastchar;
2114    int newoptions;    int newoptions;
2115    int recno;    int recno;
2116      int refsign;
2117    int skipbytes;    int skipbytes;
2118    int subreqbyte;    int subreqbyte;
2119    int subfirstbyte;    int subfirstbyte;
# Line 3621  for (;; ptr++) Line 3640  for (;; ptr++)
3640    
3641          code[1+LINK_SIZE] = OP_CREF;          code[1+LINK_SIZE] = OP_CREF;
3642          skipbytes = 3;          skipbytes = 3;
3643            refsign = -1;
3644    
3645          /* Check for a test for recursion in a named group. */          /* Check for a test for recursion in a named group. */
3646    
# Line 3644  for (;; ptr++) Line 3664  for (;; ptr++)
3664            terminator = '\'';            terminator = '\'';
3665            ptr++;            ptr++;
3666            }            }
3667          else terminator = 0;          else
3668              {
3669              terminator = 0;
3670              if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
3671              }
3672    
3673          /* We now expect to read a name; any thing else is an error */          /* We now expect to read a name; any thing else is an error */
3674    
# Line 3680  for (;; ptr++) Line 3704  for (;; ptr++)
3704          if (lengthptr != NULL) break;          if (lengthptr != NULL) break;
3705    
3706          /* In the real compile we do the work of looking for the actual          /* In the real compile we do the work of looking for the actual
3707          reference. */          reference. If the string started with "+" or "-" we require the rest to
3708            be digits, in which case recno will be set. */
3709    
3710            if (refsign > 0)
3711              {
3712              if (recno <= 0)
3713                {
3714                *errorcodeptr = ERR58;
3715                goto FAILED;
3716                }
3717              if (refsign == '-')
3718                {
3719                recno = cd->bracount - recno + 1;
3720                if (recno <= 0)
3721                  {
3722                  *errorcodeptr = ERR15;
3723                  goto FAILED;
3724                  }
3725                }
3726              else recno += cd->bracount;
3727              PUT2(code, 2+LINK_SIZE, recno);
3728              break;
3729              }
3730    
3731            /* Otherwise (did not start with "+" or "-"), start by looking for the
3732            name. */
3733    
3734          slot = cd->name_table;          slot = cd->name_table;
3735          for (i = 0; i < cd->names_found; i++)          for (i = 0; i < cd->names_found; i++)
3736            {            {
# Line 3999  for (;; ptr++) Line 4048  for (;; ptr++)
4048    
4049    
4050          /* ------------------------------------------------------------ */          /* ------------------------------------------------------------ */
4051            case '-': case '+':
4052          case '0': case '1': case '2': case '3': case '4':   /* Recursion or */          case '0': case '1': case '2': case '3': case '4':   /* Recursion or */
4053          case '5': case '6': case '7': case '8': case '9':   /* subroutine */          case '5': case '6': case '7': case '8': case '9':   /* subroutine */
4054            {            {
4055            const uschar *called;            const uschar *called;
4056    
4057              if ((refsign = *ptr) == '+') ptr++;
4058              else if (refsign == '-')
4059                {
4060                if ((digitab[ptr[1]] & ctype_digit) == 0)
4061                  goto OTHER_CHAR_AFTER_QUERY;
4062                ptr++;
4063                }
4064    
4065            recno = 0;            recno = 0;
4066            while((digitab[*ptr] & ctype_digit) != 0)            while((digitab[*ptr] & ctype_digit) != 0)
4067              recno = recno * 10 + *ptr++ - '0';              recno = recno * 10 + *ptr++ - '0';
4068    
4069            if (*ptr != ')')            if (*ptr != ')')
4070              {              {
4071              *errorcodeptr = ERR29;              *errorcodeptr = ERR29;
4072              goto FAILED;              goto FAILED;
4073              }              }
4074    
4075              if (refsign == '-')
4076                {
4077                if (recno == 0)
4078                  {
4079                  *errorcodeptr = ERR58;
4080                  goto FAILED;
4081                  }
4082                recno = cd->bracount - recno + 1;
4083                if (recno <= 0)
4084                  {
4085                  *errorcodeptr = ERR15;
4086                  goto FAILED;
4087                  }
4088                }
4089              else if (refsign == '+')
4090                {
4091                if (recno == 0)
4092                  {
4093                  *errorcodeptr = ERR58;
4094                  goto FAILED;
4095                  }
4096                recno += cd->bracount;
4097                }
4098    
4099            /* Come here from code above that handles a named recursion */            /* Come here from code above that handles a named recursion */
4100    
# Line 4084  for (;; ptr++) Line 4168  for (;; ptr++)
4168    
4169          /* ------------------------------------------------------------ */          /* ------------------------------------------------------------ */
4170          default:              /* Other characters: check option setting */          default:              /* Other characters: check option setting */
4171            OTHER_CHAR_AFTER_QUERY:
4172          set = unset = 0;          set = unset = 0;
4173          optset = &set;          optset = &set;
4174    
# Line 4396  for (;; ptr++) Line 4481  for (;; ptr++)
4481        zerofirstbyte = firstbyte;        zerofirstbyte = firstbyte;
4482        zeroreqbyte = reqbyte;        zeroreqbyte = reqbyte;
4483    
4484        /* \k<name> or \k'name' is a back reference by name (Perl syntax) */        /* \k<name> or \k'name' is a back reference by name (Perl syntax).
4485          We also support \k{name} (.NET syntax) */
4486    
4487        if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\''))        if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\'' || ptr[1] == '{'))
4488          {          {
4489          is_recurse = FALSE;          is_recurse = FALSE;
4490          terminator = (*(++ptr) == '<')? '>' : '\'';          terminator = (*(++ptr) == '<')? '>' : (*ptr == '\'')? '\'' : '}';
4491          goto NAMED_REF_OR_RECURSE;          goto NAMED_REF_OR_RECURSE;
4492          }          }
4493    
# Line 5043  Returns: pointer to compiled data Line 5129  Returns: pointer to compiled data
5129                  with errorptr and erroroffset set                  with errorptr and erroroffset set
5130  */  */
5131    
5132  PCRE_DATA_SCOPE pcre *  PCRE_EXP_DEFN pcre *
5133  pcre_compile(const char *pattern, int options, const char **errorptr,  pcre_compile(const char *pattern, int options, const char **errorptr,
5134    int *erroroffset, const unsigned char *tables)    int *erroroffset, const unsigned char *tables)
5135  {  {
# Line 5051  return pcre_compile2(pattern, options, N Line 5137  return pcre_compile2(pattern, options, N
5137  }  }
5138    
5139    
5140  PCRE_DATA_SCOPE pcre *  PCRE_EXP_DEFN pcre *
5141  pcre_compile2(const char *pattern, int options, int *errorcodeptr,  pcre_compile2(const char *pattern, int options, int *errorcodeptr,
5142    const char **errorptr, int *erroroffset, const unsigned char *tables)    const char **errorptr, int *erroroffset, const unsigned char *tables)
5143  {  {
# Line 5100  if (errorcodeptr != NULL) *errorcodeptr Line 5186  if (errorcodeptr != NULL) *errorcodeptr
5186  if (erroroffset == NULL)  if (erroroffset == NULL)
5187    {    {
5188    errorcode = ERR16;    errorcode = ERR16;
5189    goto PCRE_EARLY_ERROR_RETURN;    goto PCRE_EARLY_ERROR_RETURN2;
5190    }    }
5191    
5192  *erroroffset = 0;  *erroroffset = 0;
# Line 5113  if (utf8 && (options & PCRE_NO_UTF8_CHEC Line 5199  if (utf8 && (options & PCRE_NO_UTF8_CHEC
5199       (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)       (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
5200    {    {
5201    errorcode = ERR44;    errorcode = ERR44;
5202    goto PCRE_UTF8_ERROR_RETURN;    goto PCRE_EARLY_ERROR_RETURN2;
5203    }    }
5204  #else  #else
5205  if ((options & PCRE_UTF8) != 0)  if ((options & PCRE_UTF8) != 0)
# Line 5138  cd->cbits = tables + cbits_offset; Line 5224  cd->cbits = tables + cbits_offset;
5224  cd->ctypes = tables + ctypes_offset;  cd->ctypes = tables + ctypes_offset;
5225    
5226  /* Handle different types of newline. The three bits give seven cases. The  /* Handle different types of newline. The three bits give seven cases. The
5227  current code allows for fixed one- or two-byte sequences, plus "any". */  current code allows for fixed one- or two-byte sequences, plus "any" and
5228    "anycrlf". */
5229    
5230  switch (options & (PCRE_NEWLINE_CRLF | PCRE_NEWLINE_ANY))  switch (options & (PCRE_NEWLINE_CRLF | PCRE_NEWLINE_ANY))
5231    {    {
# Line 5148  switch (options & (PCRE_NEWLINE_CRLF | P Line 5235  switch (options & (PCRE_NEWLINE_CRLF | P
5235    case PCRE_NEWLINE_CR+    case PCRE_NEWLINE_CR+
5236         PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;         PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
5237    case PCRE_NEWLINE_ANY: newline = -1; break;    case PCRE_NEWLINE_ANY: newline = -1; break;
5238      case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
5239    default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;    default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
5240    }    }
5241    
5242  if (newline < 0)  if (newline == -2)
5243      {
5244      cd->nltype = NLTYPE_ANYCRLF;
5245      }
5246    else if (newline < 0)
5247    {    {
5248    cd->nltype = NLTYPE_ANY;    cd->nltype = NLTYPE_ANY;
5249    }    }
# Line 5325  if (errorcode != 0) Line 5417  if (errorcode != 0)
5417    (pcre_free)(re);    (pcre_free)(re);
5418    PCRE_EARLY_ERROR_RETURN:    PCRE_EARLY_ERROR_RETURN:
5419    *erroroffset = ptr - (const uschar *)pattern;    *erroroffset = ptr - (const uschar *)pattern;
5420  #ifdef SUPPORT_UTF8    PCRE_EARLY_ERROR_RETURN2:
   PCRE_UTF8_ERROR_RETURN:  
 #endif  
5421    *errorptr = error_texts[errorcode];    *errorptr = error_texts[errorcode];
5422    if (errorcodeptr != NULL) *errorcodeptr = errorcode;    if (errorcodeptr != NULL) *errorcodeptr = errorcode;
5423    return NULL;    return NULL;
# Line 5417  if ((re->options & PCRE_REQCHSET) != 0) Line 5507  if ((re->options & PCRE_REQCHSET) != 0)
5507      else printf("Req char = \\x%02x%s\n", ch, caseless);      else printf("Req char = \\x%02x%s\n", ch, caseless);
5508    }    }
5509    
5510  pcre_printint(re, stdout);  pcre_printint(re, stdout, TRUE);
5511    
5512  /* This check is done here in the debugging case so that the code that  /* This check is done here in the debugging case so that the code that
5513  was compiled can be seen. */  was compiled can be seen. */

Legend:
Removed from v.107  
changed lines
  Added in v.171

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12