/[pcre]/code/trunk/pcre.c
ViewVC logotype

Diff of /code/trunk/pcre.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 69 by nigel, Sat Feb 24 21:40:18 2007 UTC revision 71 by nigel, Sat Feb 24 21:40:24 2007 UTC
# Line 241  changed by the caller, but are shared be Line 241  changed by the caller, but are shared be
241  compiling for Virtual Pascal, things are done differently (see pcre.in). */  compiling for Virtual Pascal, things are done differently (see pcre.in). */
242    
243  #ifndef VPCOMPAT  #ifndef VPCOMPAT
244    #ifdef __cplusplus
245    extern "C" void *(*pcre_malloc)(size_t) = malloc;
246    extern "C" void  (*pcre_free)(void *) = free;
247    extern "C" int   (*pcre_callout)(pcre_callout_block *) = NULL;
248    #else
249  void *(*pcre_malloc)(size_t) = malloc;  void *(*pcre_malloc)(size_t) = malloc;
250  void  (*pcre_free)(void *) = free;  void  (*pcre_free)(void *) = free;
251  int   (*pcre_callout)(pcre_callout_block *) = NULL;  int   (*pcre_callout)(pcre_callout_block *) = NULL;
252  #endif  #endif
253    #endif
254    
255    
256  /*************************************************  /*************************************************
# Line 511  if (re == NULL || where == NULL) return Line 517  if (re == NULL || where == NULL) return
517  if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;  if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
518    
519  if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_STUDY_DATA) != 0)  if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_STUDY_DATA) != 0)
520    study = extra_data->study_data;    study = (const pcre_study_data *)extra_data->study_data;
521    
522  switch (what)  switch (what)
523    {    {
# Line 592  pcre_config(int what, void *where) Line 598  pcre_config(int what, void *where)
598  switch (what)  switch (what)
599    {    {
600    case PCRE_CONFIG_UTF8:    case PCRE_CONFIG_UTF8:
601    #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
602    *((int *)where) = 1;    *((int *)where) = 1;
603    #else  #else
604    *((int *)where) = 0;    *((int *)where) = 0;
605    #endif  #endif
606    break;    break;
607    
608    case PCRE_CONFIG_NEWLINE:    case PCRE_CONFIG_NEWLINE:
# Line 669  Arguments: Line 675  Arguments:
675    bracount   number of previous extracting brackets    bracount   number of previous extracting brackets
676    options    the options bits    options    the options bits
677    isclass    TRUE if inside a character class    isclass    TRUE if inside a character class
   cd         pointer to char tables block  
678    
679  Returns:     zero or positive => a data character  Returns:     zero or positive => a data character
680               negative => a special escape sequence               negative => a special escape sequence
# Line 678  Returns: zero or positive => a data Line 683  Returns: zero or positive => a data
683    
684  static int  static int
685  check_escape(const uschar **ptrptr, const char **errorptr, int bracount,  check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
686    int options, BOOL isclass, compile_data *cd)    int options, BOOL isclass)
687  {  {
688  const uschar *ptr = *ptrptr;  const uschar *ptr = *ptrptr;
689  int c, i;  int c, i;
# Line 801  else Line 806  else
806      c = 0;      c = 0;
807      while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)      while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
808        {        {
809        int cc = *(++ptr);        int cc;                               /* Some compilers don't like ++ */
810          cc = *(++ptr);                        /* in initializers */
811        if (cc >= 'a') cc -= 32;              /* Convert to upper case */        if (cc >= 'a') cc -= 32;              /* Convert to upper case */
812        c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));        c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
813        }        }
# Line 858  where the ddds are digits. Line 864  where the ddds are digits.
864    
865  Arguments:  Arguments:
866    p         pointer to the first char after '{'    p         pointer to the first char after '{'
   cd        pointer to char tables block  
867    
868  Returns:    TRUE or FALSE  Returns:    TRUE or FALSE
869  */  */
870    
871  static BOOL  static BOOL
872  is_counted_repeat(const uschar *p, compile_data *cd)  is_counted_repeat(const uschar *p)
873  {  {
874  if ((digitab[*p++] && ctype_digit) == 0) return FALSE;  if ((digitab[*p++] && ctype_digit) == 0) return FALSE;
875  while ((digitab[*p] & ctype_digit) != 0) p++;  while ((digitab[*p] & ctype_digit) != 0) p++;
# Line 895  Arguments: Line 900  Arguments:
900    maxp       pointer to int for max    maxp       pointer to int for max
901               returned as -1 if no max               returned as -1 if no max
902    errorptr   points to pointer to error message    errorptr   points to pointer to error message
   cd         pointer to character tables block  
903    
904  Returns:     pointer to '}' on success;  Returns:     pointer to '}' on success;
905               current ptr on error, with errorptr set               current ptr on error, with errorptr set
906  */  */
907    
908  static const uschar *  static const uschar *
909  read_repeat_counts(const uschar *p, int *minp, int *maxp,  read_repeat_counts(const uschar *p, int *minp, int *maxp, const char **errorptr)
   const char **errorptr, compile_data *cd)  
910  {  {
911  int min = 0;  int min = 0;
912  int max = -1;  int max = -1;
# Line 1793  for (;; ptr++) Line 1796  for (;; ptr++)
1796    
1797        if (c == '\\')        if (c == '\\')
1798          {          {
1799          c = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);          c = check_escape(&ptr, errorptr, *brackets, options, TRUE);
1800          if (-c == ESC_b) c = '\b';  /* \b is backspace in a class */          if (-c == ESC_b) c = '\b';  /* \b is backspace in a class */
1801    
1802          if (-c == ESC_Q)            /* Handle start of quoted string */          if (-c == ESC_Q)            /* Handle start of quoted string */
# Line 1882  for (;; ptr++) Line 1885  for (;; ptr++)
1885          if (d == '\\')          if (d == '\\')
1886            {            {
1887            const uschar *oldptr = ptr;            const uschar *oldptr = ptr;
1888            d = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);            d = check_escape(&ptr, errorptr, *brackets, options, TRUE);
1889    
1890            /* \b is backspace; any other special means the '-' was literal */            /* \b is backspace; any other special means the '-' was literal */
1891    
# Line 2091  for (;; ptr++) Line 2094  for (;; ptr++)
2094      /* Various kinds of repeat */      /* Various kinds of repeat */
2095    
2096      case '{':      case '{':
2097      if (!is_counted_repeat(ptr+1, cd)) goto NORMAL_CHAR;      if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
2098      ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr, cd);      ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr);
2099      if (*errorptr != NULL) goto FAILED;      if (*errorptr != NULL) goto FAILED;
2100      goto REPEAT;      goto REPEAT;
2101    
# Line 3039  for (;; ptr++) Line 3042  for (;; ptr++)
3042    
3043      case '\\':      case '\\':
3044      tempptr = ptr;      tempptr = ptr;
3045      c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);      c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
3046    
3047      /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values      /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
3048      are arranged to be the negation of the corresponding OP_values. For the      are arranged to be the negation of the corresponding OP_values. For the
# Line 3142  for (;; ptr++) Line 3145  for (;; ptr++)
3145        if (c == '\\')        if (c == '\\')
3146          {          {
3147          tempptr = ptr;          tempptr = ptr;
3148          c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);          c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
3149          if (c < 0) { ptr = tempptr; break; }          if (c < 0) { ptr = tempptr; break; }
3150    
3151          /* If a character is > 127 in UTF-8 mode, we have to turn it into          /* If a character is > 127 in UTF-8 mode, we have to turn it into
# Line 3727  return c; Line 3730  return c;
3730    
3731    
3732    
3733    #ifdef SUPPORT_UTF8
3734    /*************************************************
3735    *         Validate a UTF-8 string                *
3736    *************************************************/
3737    
3738    /* This function is called (optionally) at the start of compile or match, to
3739    validate that a supposed UTF-8 string is actually valid. The early check means
3740    that subsequent code can assume it is dealing with a valid string. The check
3741    can be turned off for maximum performance, but the consequences of supplying
3742    an invalid string are then undefined.
3743    
3744    Arguments:
3745      string       points to the string
3746      length       length of string, or -1 if the string is zero-terminated
3747    
3748    Returns:       < 0    if the string is a valid UTF-8 string
3749                   >= 0   otherwise; the value is the offset of the bad byte
3750    */
3751    
3752    static int
3753    valid_utf8(const uschar *string, int length)
3754    {
3755    register const uschar *p;
3756    
3757    if (length < 0)                          /* Negative length: string is zero-terminated */
3758      {
3759      for (p = string; *p != 0; p++);        /* Empty body: advance p to the terminating zero */
3760      length = p - string;                   /* Byte count, excluding the terminator */
3761      }
3762    
3763    for (p = string; length-- > 0; p++)      /* length is decremented for each byte as it is visited */
3764      {
3765      int ab;
3766      if (*p < 128) continue;                /* One-byte character: nothing more to check */
3767      if ((*p & 0xc0) != 0xc0) return p - string;   /* Top two bits not both set: not a lead byte */
3768      ab = utf8_table4[*p & 0x3f];  /* Number of additional bytes */
3769      if (length < ab) return p - string;    /* Too few bytes left (length already excludes this byte) */
3770      while (ab-- > 0)
3771        {
3772        if ((*(++p) & 0xc0) != 0x80) return p - string;   /* Following byte must have top bits 10 */
3773        length--;                            /* Keep the remaining count in step with p */
3774        }
3775      }
3776    
3777    return -1;                               /* No bad byte found */
3778    }
3779    #endif
3780    
3781    
3782    
3783  /*************************************************  /*************************************************
3784  *        Compile a Regular Expression            *  *        Compile a Regular Expression            *
3785  *************************************************/  *************************************************/
# Line 3793  if (erroroffset == NULL) Line 3846  if (erroroffset == NULL)
3846    
3847  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
3848  utf8 = (options & PCRE_UTF8) != 0;  utf8 = (options & PCRE_UTF8) != 0;
3849    if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
3850         (*erroroffset = valid_utf8((uschar *)pattern, -1)) >= 0)
3851      {
3852      *errorptr = ERR44;
3853      return NULL;
3854      }
3855  #else  #else
3856  if ((options & PCRE_UTF8) != 0)  if ((options & PCRE_UTF8) != 0)
3857    {    {
# Line 3874  while ((c = *(++ptr)) != 0) Line 3933  while ((c = *(++ptr)) != 0)
3933      case '\\':      case '\\':
3934        {        {
3935        const uschar *save_ptr = ptr;        const uschar *save_ptr = ptr;
3936        c = check_escape(&ptr, errorptr, bracount, options, FALSE, &compile_block);        c = check_escape(&ptr, errorptr, bracount, options, FALSE);
3937        if (*errorptr != NULL) goto PCRE_ERROR_RETURN;        if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
3938        if (c >= 0)        if (c >= 0)
3939          {          {
# Line 3910  while ((c = *(++ptr)) != 0) Line 3969  while ((c = *(++ptr)) != 0)
3969        if (refnum > compile_block.top_backref)        if (refnum > compile_block.top_backref)
3970          compile_block.top_backref = refnum;          compile_block.top_backref = refnum;
3971        length += 2;   /* For single back reference */        length += 2;   /* For single back reference */
3972        if (ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))        if (ptr[1] == '{' && is_counted_repeat(ptr+2))
3973          {          {
3974          ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);          ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
3975          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
3976          if ((min == 0 && (max == 1 || max == -1)) ||          if ((min == 0 && (max == 1 || max == -1)) ||
3977            (min == 1 && max == -1))            (min == 1 && max == -1))
# Line 3942  while ((c = *(++ptr)) != 0) Line 4001  while ((c = *(++ptr)) != 0)
4001      class, or back reference. */      class, or back reference. */
4002    
4003      case '{':      case '{':
4004      if (!is_counted_repeat(ptr+1, &compile_block)) goto NORMAL_CHAR;      if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
4005      ptr = read_repeat_counts(ptr+1, &min, &max, errorptr, &compile_block);      ptr = read_repeat_counts(ptr+1, &min, &max, errorptr);
4006      if (*errorptr != NULL) goto PCRE_ERROR_RETURN;      if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4007    
4008      /* These special cases just insert one extra opcode */      /* These special cases just insert one extra opcode */
# Line 4039  while ((c = *(++ptr)) != 0) Line 4098  while ((c = *(++ptr)) != 0)
4098  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
4099          int prevchar = ptr[-1];          int prevchar = ptr[-1];
4100  #endif  #endif
4101          int ch = check_escape(&ptr, errorptr, bracount, options, TRUE,          int ch = check_escape(&ptr, errorptr, bracount, options, TRUE);
           &compile_block);  
4102          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4103    
4104          /* \b is backspace inside a class */          /* \b is backspace inside a class */
# Line 4151  while ((c = *(++ptr)) != 0) Line 4209  while ((c = *(++ptr)) != 0)
4209    
4210        /* A repeat needs either 1 or 5 bytes. */        /* A repeat needs either 1 or 5 bytes. */
4211    
4212        if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))        if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2))
4213          {          {
4214          ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);          ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
4215          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4216          if ((min == 0 && (max == 1 || max == -1)) ||          if ((min == 0 && (max == 1 || max == -1)) ||
4217            (min == 1 && max == -1))            (min == 1 && max == -1))
# Line 4505  while ((c = *(++ptr)) != 0) Line 4563  while ((c = *(++ptr)) != 0)
4563      /* Leave ptr at the final char; for read_repeat_counts this happens      /* Leave ptr at the final char; for read_repeat_counts this happens
4564      automatically; for the others we need an increment. */      automatically; for the others we need an increment. */
4565    
4566      if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2, &compile_block))      if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2))
4567        {        {
4568        ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);        ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
4569        if (*errorptr != NULL) goto PCRE_ERROR_RETURN;        if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4570        }        }
4571      else if (c == '*') { min = 0; max = -1; ptr++; }      else if (c == '*') { min = 0; max = -1; ptr++; }
# Line 4596  while ((c = *(++ptr)) != 0) Line 4654  while ((c = *(++ptr)) != 0)
4654        if (c == '\\')        if (c == '\\')
4655          {          {
4656          const uschar *saveptr = ptr;          const uschar *saveptr = ptr;
4657          c = check_escape(&ptr, errorptr, bracount, options, FALSE,          c = check_escape(&ptr, errorptr, bracount, options, FALSE);
           &compile_block);  
4658          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;          if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4659          if (c < 0) { ptr = saveptr; break; }          if (c < 0) { ptr = saveptr; break; }
4660    
# Line 7307  if (extra_data != NULL) Line 7364  if (extra_data != NULL)
7364    {    {
7365    register unsigned int flags = extra_data->flags;    register unsigned int flags = extra_data->flags;
7366    if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)    if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
7367      study = extra_data->study_data;      study = (const pcre_study_data *)extra_data->study_data;
7368    if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)    if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
7369      match_block.match_limit = extra_data->match_limit;      match_block.match_limit = extra_data->match_limit;
7370    if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)    if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
# Line 7340  match_block.recursive = NULL; Line 7397  match_block.recursive = NULL;
7397  match_block.lcc = re->tables + lcc_offset;  match_block.lcc = re->tables + lcc_offset;
7398  match_block.ctypes = re->tables + ctypes_offset;  match_block.ctypes = re->tables + ctypes_offset;
7399    
7400    /* Check a UTF-8 string if required. Unfortunately there's no way of passing
7401    back the character offset. */
7402    
7403    #ifdef SUPPORT_UTF8
7404    if (match_block.utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
7405        valid_utf8((uschar *)subject, length) >= 0)
7406      return PCRE_ERROR_BADUTF8;
7407    #endif
7408    
7409  /* The ims options can vary during the matching as a result of the presence  /* The ims options can vary during the matching as a result of the presence
7410  of (?ims) items in the pattern. They are kept in a local variable so that  of (?ims) items in the pattern. They are kept in a local variable so that
7411  restoring at the exit of a group is easy. */  restoring at the exit of a group is easy. */

Legend:
Removed from v.69  
changed lines
  Added in v.71

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12