/[pcre]/code/trunk/pcre.c
ViewVC logotype

Diff of /code/trunk/pcre.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 42 by nigel, Sat Feb 24 21:39:13 2007 UTC revision 43 by nigel, Sat Feb 24 21:39:21 2007 UTC
# Line 9  the file Tech.Notes for some information Line 9  the file Tech.Notes for some information
9    
10  Written by: Philip Hazel <ph10@cam.ac.uk>  Written by: Philip Hazel <ph10@cam.ac.uk>
11    
12             Copyright (c) 1997-1999 University of Cambridge             Copyright (c) 1997-2000 University of Cambridge
13    
14  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
15  Permission is granted to anyone to use this software for any purpose on any  Permission is granted to anyone to use this software for any purpose on any
# Line 82  static const char *OP_names[] = { Line 82  static const char *OP_names[] = {
82    "*", "*?", "+", "+?", "?", "??", "{", "{", "{",    "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
83    "*", "*?", "+", "+?", "?", "??", "{", "{", "{",    "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
84    "*", "*?", "+", "+?", "?", "??", "{", "{",    "*", "*?", "+", "+?", "?", "??", "{", "{",
85    "class", "Ref",    "class", "Ref", "Recurse",
86    "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not",    "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not",
87    "AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cref",    "AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cref",
88    "Brazero", "Braminzero", "Bra"    "Brazero", "Braminzero", "Bra"
# Line 107  static const short int escapes[] = { Line 107  static const short int escapes[] = {
107      0,      0, -ESC_z                                            /* x - z */      0,      0, -ESC_z                                            /* x - z */
108  };  };
109    
110    /* Tables of names of POSIX character classes and their lengths. The list is
111    terminated by a zero length entry. The first three must be alpha, lower, upper,
112    as this is assumed for handling case independence. */
113    
114    static const char *posix_names[] = {
115      "alpha", "lower", "upper",
116      "alnum", "ascii", "cntrl", "digit", "graph",
117      "print", "punct", "space", "word",  "xdigit" };
118    
119    static const uschar posix_name_lengths[] = {
120      5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
121    
122    /* Table of class bit maps for each POSIX class; up to three may be combined
123    to form the class. */
124    
125    static const int posix_class_maps[] = {
126      cbit_lower, cbit_upper, -1,             /* alpha */
127      cbit_lower, -1,         -1,             /* lower */
128      cbit_upper, -1,         -1,             /* upper */
129      cbit_digit, cbit_lower, cbit_upper,     /* alnum */
130      cbit_print, cbit_cntrl, -1,             /* ascii */
131      cbit_cntrl, -1,         -1,             /* cntrl */
132      cbit_digit, -1,         -1,             /* digit */
133      cbit_graph, -1,         -1,             /* graph */
134      cbit_print, -1,         -1,             /* print */
135      cbit_punct, -1,         -1,             /* punct */
136      cbit_space, -1,         -1,             /* space */
137      cbit_word,  -1,         -1,             /* word */
138      cbit_xdigit,-1,         -1              /* xdigit */
139    };
140    
141    
142  /* Definition to allow mutual recursion */  /* Definition to allow mutual recursion */
143    
144  static BOOL  static BOOL
# Line 161  return XSTRING(PCRE_MAJOR) "." XSTRING(P Line 193  return XSTRING(PCRE_MAJOR) "." XSTRING(P
193    
194    
195  /*************************************************  /*************************************************
196  *       Return info about a compiled pattern     *  * (Obsolete) Return info about compiled pattern  *
197  *************************************************/  *************************************************/
198    
199  /* This function picks potentially useful data out of the private  /* This is the original "info" function. It picks potentially useful data out
200  structure. The public options are passed back in an int - though the  of the private structure, but its interface was too rigid. It remains for
201  re->options field has been expanded to a long int, all the public options  backwards compatibility. The public options are passed back in an int - though
202    the re->options field has been expanded to a long int, all the public options
203  at the low end of it, and so even on 16-bit systems this will still be OK.  at the low end of it, and so even on 16-bit systems this will still be OK.
204  Therefore, I haven't changed the API for pcre_info().  Therefore, I haven't changed the API for pcre_info().
205    
# Line 177  Arguments: Line 210  Arguments:
210                  or -1 if multiline and all branches start ^,                  or -1 if multiline and all branches start ^,
211                  or -2 otherwise                  or -2 otherwise
212    
213  Returns:        number of identifying extraction brackets  Returns:        number of capturing subpatterns
214                  or negative values on error                  or negative values on error
215  */  */
216    
# Line 196  return re->top_bracket; Line 229  return re->top_bracket;
229    
230    
231    
232    /*************************************************
233    *        Return info about compiled pattern      *
234    *************************************************/
235    
236    /* This is a newer "info" function which has an extensible interface so
237    that additional items can be added compatibly.
238    
239    Arguments:
240      external_re      points to compiled code
241      external_study   points to study data, or NULL
242      what             what information is required
243      where            where to put the information
244    
245    Returns:           0 if data returned, negative on error
246    */
247    
248    int
249    pcre_fullinfo(const pcre *external_re, const pcre_extra *study_data, int what,
250      void *where)
251    {
252    const real_pcre *re = (const real_pcre *)external_re;
253    const real_pcre_extra *study = (const real_pcre_extra *)study_data;
254    
255    if (re == NULL || where == NULL) return PCRE_ERROR_NULL;
256    if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
257    
258    switch (what)
259      {
260      case PCRE_INFO_OPTIONS:
261      *((unsigned long int *)where) = re->options & PUBLIC_OPTIONS;
262      break;
263    
264      case PCRE_INFO_SIZE:
265      *((size_t *)where) = re->size;
266      break;
267    
268      case PCRE_INFO_CAPTURECOUNT:
269      *((int *)where) = re->top_bracket;
270      break;
271    
272      case PCRE_INFO_BACKREFMAX:
273      *((int *)where) = re->top_backref;
274      break;
275    
276      case PCRE_INFO_FIRSTCHAR:
277      *((int *)where) =
278        ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
279        ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
280      break;
281    
282      case PCRE_INFO_FIRSTTABLE:
283      *((const uschar **)where) =
284        (study != NULL && (study->options & PCRE_STUDY_MAPPED) != 0)?
285          study->start_bits : NULL;
286      break;
287    
288      case PCRE_INFO_LASTLITERAL:
289      *((int *)where) =
290        ((re->options & PCRE_REQCHSET) != 0)? re->req_char : -1;
291      break;
292    
293      default: return PCRE_ERROR_BADOPTION;
294      }
295    
296    return 0;
297    }
298    
299    
300    
301  #ifdef DEBUG  #ifdef DEBUG
302  /*************************************************  /*************************************************
# Line 255  check_escape(const uschar **ptrptr, cons Line 356  check_escape(const uschar **ptrptr, cons
356    int options, BOOL isclass, compile_data *cd)    int options, BOOL isclass, compile_data *cd)
357  {  {
358  const uschar *ptr = *ptrptr;  const uschar *ptr = *ptrptr;
359  int c = *(++ptr) & 255;   /* Ensure > 0 on signed-char systems */  int c, i;
 int i;  
360    
361    c = *(++ptr) & 255;   /* Ensure > 0 on signed-char systems */
362  if (c == 0) *errorptr = ERR1;  if (c == 0) *errorptr = ERR1;
363    
364  /* Digits or letters may have special meaning; all others are literals. */  /* Digits or letters may have special meaning; all others are literals. */
# Line 622  for (;;) Line 723  for (;;)
723    
724    
725  /*************************************************  /*************************************************
726    *           Check for POSIX class syntax         *
727    *************************************************/
728    
729    /* This function is called when the sequence "[:" or "[." or "[=" is
730    encountered in a character class. It checks whether this is followed by an
731    optional ^ and then a sequence of letters, terminated by a matching ":]" or
732    ".]" or "=]".
733    
734    Argument:
735      ptr      pointer to the initial [
736      endptr   where to return the end pointer
737      cd       pointer to compile data
738    
739    Returns:   TRUE or FALSE
740    */
741    
742    static BOOL
743    check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
744    {
745    int terminator;          /* Don't combine these lines; the Solaris cc */
746    terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
747    if (*(++ptr) == '^') ptr++;
748    while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
749    if (*ptr == terminator && ptr[1] == ']')
750      {
751      *endptr = ptr;
752      return TRUE;
753      }
754    return FALSE;
755    }
756    
757    
758    
759    
760    /*************************************************
761    *          Check POSIX class name                *
762    *************************************************/
763    
764    /* This function is called to check the name given in a POSIX-style class entry
765    such as [:alnum:].
766    
767    Arguments:
768      ptr        points to the first letter
769      len        the length of the name
770    
771    Returns:     a value representing the name, or -1 if unknown
772    */
773    
774    static int
775    check_posix_name(const uschar *ptr, int len)
776    {
777    register int yield = 0;
778    while (posix_name_lengths[yield] != 0)
779      {
780      if (len == posix_name_lengths[yield] &&
781        strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
782      yield++;
783      }
784    return -1;
785    }
786    
787    
788    
789    
790    /*************************************************
791  *           Compile one branch                   *  *           Compile one branch                   *
792  *************************************************/  *************************************************/
793    
# Line 764  for (;; ptr++) Line 930  for (;; ptr++)
930          goto FAILED;          goto FAILED;
931          }          }
932    
933          /* Handle POSIX class names. Perl allows a negation extension of the
934          form [:^name]. A square bracket that doesn't match the syntax is
935          treated as a literal. We also recognize the POSIX constructions
936          [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
937          5.6 does. */
938    
939          if (c == '[' &&
940              (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
941              check_posix_syntax(ptr, &tempptr, cd))
942            {
943            BOOL local_negate = FALSE;
944            int posix_class, i;
945            register const uschar *cbits = cd->cbits;
946    
947            if (ptr[1] != ':')
948              {
949              *errorptr = ERR31;
950              goto FAILED;
951              }
952    
953            ptr += 2;
954            if (*ptr == '^')
955              {
956              local_negate = TRUE;
957              ptr++;
958              }
959    
960            posix_class = check_posix_name(ptr, tempptr - ptr);
961            if (posix_class < 0)
962              {
963              *errorptr = ERR30;
964              goto FAILED;
965              }
966    
967            /* If matching is caseless, upper and lower are converted to
968            alpha. This relies on the fact that the class table starts with
969            alpha, lower, upper as the first 3 entries. */
970    
971            if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
972              posix_class = 0;
973    
974            /* Or into the map we are building up to 3 of the static class
975            tables, or their negations. */
976    
977            posix_class *= 3;
978            for (i = 0; i < 3; i++)
979              {
980              int taboffset = posix_class_maps[posix_class + i];
981              if (taboffset < 0) break;
982              if (local_negate)
983                for (c = 0; c < 32; c++) class[c] |= ~cbits[c+taboffset];
984              else
985                for (c = 0; c < 32; c++) class[c] |= cbits[c+taboffset];
986              }
987    
988            ptr = tempptr + 1;
989            class_charcount = 10;  /* Set > 1; assumes more than 1 per class */
990            continue;
991            }
992    
993        /* Backslash may introduce a single character, or it may introduce one        /* Backslash may introduce a single character, or it may introduce one
994        of the specials, which just set a flag. Escaped items are checked for        of the specials, which just set a flag. Escaped items are checked for
995        validity in the pre-compiling pass. The sequence \b is a special case.        validity in the pre-compiling pass. The sequence \b is a special case.
# Line 791  for (;; ptr++) Line 1017  for (;; ptr++)
1017              continue;              continue;
1018    
1019              case ESC_w:              case ESC_w:
1020              for (c = 0; c < 32; c++)              for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_word];
               class[c] |= (cbits[c+cbit_digit] | cbits[c+cbit_word]);  
1021              continue;              continue;
1022    
1023              case ESC_W:              case ESC_W:
1024              for (c = 0; c < 32; c++)              for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_word];
               class[c] |= ~(cbits[c+cbit_digit] | cbits[c+cbit_word]);  
1025              continue;              continue;
1026    
1027              case ESC_s:              case ESC_s:
# Line 1360  for (;; ptr++) Line 1584  for (;; ptr++)
1584          ptr++;          ptr++;
1585          break;          break;
1586    
1587            case 'R':                 /* Pattern recursion */
1588            *code++ = OP_RECURSE;
1589            ptr++;
1590            continue;
1591    
1592          default:                  /* Option setting */          default:                  /* Option setting */
1593          set = unset = 0;          set = unset = 0;
1594          optset = &set;          optset = &set;
# Line 2015  pcre_compile(const char *pattern, int op Line 2244  pcre_compile(const char *pattern, int op
2244  real_pcre *re;  real_pcre *re;
2245  int length = 3;      /* For initial BRA plus length */  int length = 3;      /* For initial BRA plus length */
2246  int runlength;  int runlength;
2247  int c, size, reqchar, countlits;  int c, reqchar, countlits;
2248  int bracount = 0;  int bracount = 0;
2249  int top_backref = 0;  int top_backref = 0;
2250  int branch_extra = 0;  int branch_extra = 0;
2251  int branch_newextra;  int branch_newextra;
2252  unsigned int brastackptr = 0;  unsigned int brastackptr = 0;
2253    size_t size;
2254  uschar *code;  uschar *code;
2255  const uschar *ptr;  const uschar *ptr;
2256  compile_data compile_block;  compile_data compile_block;
# Line 2248  while ((c = *(++ptr)) != 0) Line 2478  while ((c = *(++ptr)) != 0)
2478          ptr += 2;          ptr += 2;
2479          break;          break;
2480    
2481            /* A recursive call to the regex is an extension, to provide the
2482            facility which can be obtained by $(?p{perl-code}) in Perl 5.6. */
2483    
2484            case 'R':
2485            if (ptr[3] != ')')
2486              {
2487              *errorptr = ERR29;
2488              goto PCRE_ERROR_RETURN;
2489              }
2490            ptr += 3;
2491            length += 1;
2492            break;
2493    
2494          /* Lookbehinds are in Perl from version 5.005 */          /* Lookbehinds are in Perl from version 5.005 */
2495    
2496          case '<':          case '<':
# Line 2550  if (re == NULL) Line 2793  if (re == NULL)
2793    return NULL;    return NULL;
2794    }    }
2795    
2796  /* Put in the magic number and the options. */  /* Put in the magic number, and save the size, options, and table pointer */
2797    
2798  re->magic_number = MAGIC_NUMBER;  re->magic_number = MAGIC_NUMBER;
2799    re->size = size;
2800  re->options = options;  re->options = options;
2801  re->tables = tables;  re->tables = tables;
2802    
# Line 3147  for (;;) Line 3391  for (;;)
3391      ecode += 3;      ecode += 3;
3392      break;      break;
3393    
3394        /* Recursion matches the current regex, nested. If there are any capturing
3395        brackets started but not finished, we have to save their starting points
3396        and reinstate them after the recursion. However, we don't know how many
3397        such there are (offset_top records the completed total) so we just have
3398        to save all the potential data. There may be up to 99 such values, which
3399        is a bit large to put on the stack, but using malloc for small numbers
3400        seems expensive. As a compromise, the stack is used when there are fewer
3401        than 16 values to store; otherwise malloc is used. A problem is what to do
3402        if the malloc fails ... there is no way of returning to the top level with
3403        an error. Save the top 15 values on the stack, and accept that the rest
3404        may be wrong. */
3405    
3406        case OP_RECURSE:
3407          {
3408          BOOL rc;
3409          int *save;
3410          int stacksave[15];
3411    
3412          c = md->offset_max;
3413    
3414          if (c < 16) save = stacksave; else
3415            {
3416            save = (int *)(pcre_malloc)((c+1) * sizeof(int));
3417            if (save == NULL)
3418              {
3419              save = stacksave;
3420              c = 15;
3421              }
3422            }
3423    
3424          for (i = 1; i <= c; i++)
3425            save[i] = md->offset_vector[md->offset_end - i];
3426          rc = match(eptr, md->start_pattern, offset_top, md, ims, FALSE, eptrb);
3427          for (i = 1; i <= c; i++)
3428            md->offset_vector[md->offset_end - i] = save[i];
3429          if (save != stacksave) (pcre_free)(save);
3430          if (!rc) return FALSE;
3431    
3432          /* In case the recursion has set more capturing values, save the final
3433          number, then move along the subject till after the recursive match,
3434          and advance one byte in the pattern code. */
3435    
3436          offset_top = md->end_offset_top;
3437          eptr = md->end_match_ptr;
3438          ecode++;
3439          }
3440        break;
3441    
3442      /* "Once" brackets are like assertion brackets except that after a match,      /* "Once" brackets are like assertion brackets except that after a match,
3443      the point in the subject string is not moved back. Thus there can never be      the point in the subject string is not moved back. Thus there can never be
# Line 4216  if (re == NULL || subject == NULL || Line 4507  if (re == NULL || subject == NULL ||
4507     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
4508  if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;  if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
4509    
4510    match_block.start_pattern = re->code;
4511  match_block.start_subject = (const uschar *)subject;  match_block.start_subject = (const uschar *)subject;
4512  match_block.end_subject = match_block.start_subject + length;  match_block.end_subject = match_block.start_subject + length;
4513  end_subject = match_block.end_subject;  end_subject = match_block.end_subject;

Legend:
Removed from v.42  
changed lines
  Added in v.43

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12