/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory | Revision Log | View Patch

revision 747 by ph10, Tue Nov 15 17:35:10 2011 UTC  |  revision 758 by ph10, Mon Nov 21 12:05:36 2011 UTC
# Line 412  static const char error_texts[] = Line 412  static const char error_texts[] =
412    "\\k is not followed by a braced, angle-bracketed, or quoted name\0"    "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
413    /* 70 */    /* 70 */
414    "internal error: unknown opcode in find_fixedlength()\0"    "internal error: unknown opcode in find_fixedlength()\0"
415      "\\N is not supported in a class\0"
416    ;    ;
417    
418  /* Table to identify digits and hex digits. This is used when compiling  /* Table to identify digits and hex digits. This is used when compiling
# Line 1528  Arguments: Line 1529  Arguments:
1529    
1530  Returns:   the fixed length,  Returns:   the fixed length,
1531               or -1 if there is no fixed length,               or -1 if there is no fixed length,
1532               or -2 if \C was encountered               or -2 if \C was encountered (in UTF-8 mode only)
1533               or -3 if an OP_RECURSE item was encountered and atend is FALSE               or -3 if an OP_RECURSE item was encountered and atend is FALSE
1534               or -4 if an unknown opcode was encountered (internal error)               or -4 if an unknown opcode was encountered (internal error)
1535  */  */
# Line 1702  for (;;) Line 1703  for (;;)
1703      cc++;      cc++;
1704      break;      break;
1705    
1706      /* The single-byte matcher isn't allowed */      /* The single-byte matcher isn't allowed. This only happens in UTF-8 mode;
1707        otherwise \C is coded as OP_ALLANY. */
1708    
1709      case OP_ANYBYTE:      case OP_ANYBYTE:
1710      return -2;      return -2;
# Line 3352  for (;; ptr++) Line 3354  for (;; ptr++)
3354        }        }
3355    
3356      *lengthptr += (int)(code - last_code);      *lengthptr += (int)(code - last_code);
3357      DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));      DPRINTF(("length=%d added %d c=%c\n", *lengthptr, (int)(code - last_code),
3358          c));
3359    
3360      /* If "previous" is set and it is not at the start of the work space, move      /* If "previous" is set and it is not at the start of the work space, move
3361      it back to there, in order to avoid filling up the work space. Otherwise,      it back to there, in order to avoid filling up the work space. Otherwise,
# Line 3768  for (;; ptr++) Line 3771  for (;; ptr++)
3771          if (*errorcodeptr != 0) goto FAILED;          if (*errorcodeptr != 0) goto FAILED;
3772    
3773          if (-c == ESC_b) c = CHAR_BS;    /* \b is backspace in a class */          if (-c == ESC_b) c = CHAR_BS;    /* \b is backspace in a class */
3774            else if (-c == ESC_N)            /* \N is not supported in a class */
3775              {
3776              *errorcodeptr = ERR71;
3777              goto FAILED;
3778              }
3779          else if (-c == ESC_Q)            /* Handle start of quoted string */          else if (-c == ESC_Q)            /* Handle start of quoted string */
3780            {            {
3781            if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)            if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
# Line 4424  for (;; ptr++) Line 4432  for (;; ptr++)
4432      past, but it no longer happens for non-repeated recursions. In fact, the      past, but it no longer happens for non-repeated recursions. In fact, the
4433      repeated ones could be re-implemented independently so as not to need this,      repeated ones could be re-implemented independently so as not to need this,
4434      but for the moment we rely on the code for repeating groups. */      but for the moment we rely on the code for repeating groups. */
4435    
4436      if (*previous == OP_RECURSE)      if (*previous == OP_RECURSE)
4437        {        {
4438        memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);        memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);
# Line 4982  for (;; ptr++) Line 4990  for (;; ptr++)
4990        ONCE brackets can be converted into non-capturing brackets, as the        ONCE brackets can be converted into non-capturing brackets, as the
4991        behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to        behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to
4992        deal with possessive ONCEs specially.        deal with possessive ONCEs specially.
4993    
4994        Otherwise, if the quantifier was possessive, we convert the BRA code to        Otherwise, when we are doing the actual compile phase, check to see
4995        the POS form, and the KET code to KETRPOS. (It turns out to be convenient        whether this group is one that could match an empty string. If so,
4996        at runtime to detect this kind of subpattern at both the start and at the        convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
4997        end.) The use of special opcodes makes it possible to reduce greatly the        that runtime checking can be done. [This check is also applied to ONCE
4998        stack usage in pcre_exec(). If the group is preceded by OP_BRAZERO,        groups at runtime, but in a different way.]
4999        convert this to OP_BRAPOSZERO. Then cancel the possessive flag so that  
5000        the default action below, of wrapping everything inside atomic brackets,        Then, if the quantifier was possessive and the bracket is not a
5001        does not happen.        conditional, we convert the BRA code to the POS form, and the KET code to
5002          KETRPOS. (It turns out to be convenient at runtime to detect this kind of
5003        Then, when we are doing the actual compile phase, check to see whether        subpattern at both the start and at the end.) The use of special opcodes
5004        this group is one that could match an empty string. If so, convert the        makes it possible to reduce greatly the stack usage in pcre_exec(). If
5005        initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so that runtime        the group is preceded by OP_BRAZERO, convert this to OP_BRAPOSZERO.
5006        checking can be done. [This check is also applied to ONCE groups at  
5007        runtime, but in a different way.] */        Then, if the minimum number of matches is 1 or 0, cancel the possessive
5008          flag so that the default action below, of wrapping everything inside
5009          atomic brackets, does not happen. When the minimum is greater than 1,
5010          there will be earlier copies of the group, and so we still have to wrap
5011          the whole thing. */
5012    
5013        else        else
5014          {          {
5015          uschar *ketcode = code - 1 - LINK_SIZE;          uschar *ketcode = code - 1 - LINK_SIZE;
5016          uschar *bracode = ketcode - GET(ketcode, 1);          uschar *bracode = ketcode - GET(ketcode, 1);
5017    
5018            /* Convert possessive ONCE brackets to non-capturing */
5019    
5020          if ((*bracode == OP_ONCE || *bracode == OP_ONCE_NC) &&          if ((*bracode == OP_ONCE || *bracode == OP_ONCE_NC) &&
5021              possessive_quantifier) *bracode = OP_BRA;              possessive_quantifier) *bracode = OP_BRA;
5022    
5023            /* For non-possessive ONCE brackets, all we need to do is to
5024            set the KET. */
5025    
5026          if (*bracode == OP_ONCE || *bracode == OP_ONCE_NC)          if (*bracode == OP_ONCE || *bracode == OP_ONCE_NC)
5027            *ketcode = OP_KETRMAX + repeat_type;            *ketcode = OP_KETRMAX + repeat_type;
5028    
5029            /* Handle non-ONCE brackets and possessive ONCEs (which have been
5030            converted to non-capturing above). */
5031    
5032          else          else
5033            {            {
5034            if (possessive_quantifier)            /* In the compile phase, check for empty string matching. */
5035              {  
             *bracode += 1;                   /* Switch to xxxPOS opcodes */  
             *ketcode = OP_KETRPOS;  
             if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;  
             possessive_quantifier = FALSE;  
             }  
           else *ketcode = OP_KETRMAX + repeat_type;  
   
5036            if (lengthptr == NULL)            if (lengthptr == NULL)
5037              {              {
5038              uschar *scode = bracode;              uschar *scode = bracode;
# Line 5033  for (;; ptr++) Line 5047  for (;; ptr++)
5047                }                }
5048              while (*scode == OP_ALT);              while (*scode == OP_ALT);
5049              }              }
5050    
5051              /* Handle possessive quantifiers. */
5052    
5053              if (possessive_quantifier)
5054                {
5055                /* For COND brackets, we wrap the whole thing in a possessively
5056                repeated non-capturing bracket, because we have not invented POS
5057                versions of the COND opcodes. Because we are moving code along, we
5058                must ensure that any pending recursive references are updated. */
5059    
5060                if (*bracode == OP_COND || *bracode == OP_SCOND)
5061                  {
5062                  int nlen = (int)(code - bracode);
5063                  *code = OP_END;
5064                  adjust_recurse(bracode, 1 + LINK_SIZE, utf8, cd, save_hwm);
5065                  memmove(bracode + 1+LINK_SIZE, bracode, nlen);
5066                  code += 1 + LINK_SIZE;
5067                  nlen += 1 + LINK_SIZE;
5068                  *bracode = OP_BRAPOS;
5069                  *code++ = OP_KETRPOS;
5070                  PUTINC(code, 0, nlen);
5071                  PUT(bracode, 1, nlen);
5072                  }
5073    
5074                /* For non-COND brackets, we modify the BRA code and use KETRPOS. */
5075    
5076                else
5077                  {
5078                  *bracode += 1;              /* Switch to xxxPOS opcodes */
5079                  *ketcode = OP_KETRPOS;
5080                  }
5081    
5082                /* If the minimum is zero, mark it as possessive, then unset the
5083                possessive flag when the minimum is 0 or 1. */
5084    
5085                if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
5086                if (repeat_min < 2) possessive_quantifier = FALSE;
5087                }
5088    
5089              /* Non-possessive quantifier */
5090    
5091              else *ketcode = OP_KETRMAX + repeat_type;
5092            }            }
5093          }          }
5094        }        }
# Line 5059  for (;; ptr++) Line 5115  for (;; ptr++)
5115      notation is just syntactic sugar, taken from Sun's Java package, but the      notation is just syntactic sugar, taken from Sun's Java package, but the
5116      special opcodes can optimize it.      special opcodes can optimize it.
5117    
5118      Possessively repeated subpatterns have already been handled in the code      Some (but not all) possessively repeated subpatterns have already been
5119      just above, so possessive_quantifier is always FALSE for them at this      completely handled in the code just above. For them, possessive_quantifier
5120      stage.      is always FALSE at this stage.
5121    
5122      Note that the repeated item starts at tempcode, not at previous, which      Note that the repeated item starts at tempcode, not at previous, which
5123      might be the first part of a string whose (former) last char we repeated.      might be the first part of a string whose (former) last char we repeated.
# Line 5551  for (;; ptr++) Line 5607  for (;; ptr++)
5607    
5608          /* ------------------------------------------------------------ */          /* ------------------------------------------------------------ */
5609          case CHAR_C:                 /* Callout - may be followed by digits; */          case CHAR_C:                 /* Callout - may be followed by digits; */
5610          previous_callout = code;  /* Save for later completion */          previous_callout = code;     /* Save for later completion */
5611          after_manual_callout = 1; /* Skip one item before completing */          after_manual_callout = 1;    /* Skip one item before completing */
5612          *code++ = OP_CALLOUT;          *code++ = OP_CALLOUT;
5613            {            {
5614            int n = 0;            int n = 0;
# Line 6429  for (;; ptr++) Line 6485  for (;; ptr++)
6485            }            }
6486          else          else
6487  #endif  #endif
6488            {          /* In non-UTF-8 mode, we turn \C into OP_ALLANY instead of OP_ANYBYTE
6489            so that it works in DFA mode and in lookbehinds. */
6490    
6491              {
6492            previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;            previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
6493            *code++ = -c;            *code++ = (!utf8 && c == -ESC_C)? OP_ALLANY : -c;
6494            }            }
6495          }          }
6496        continue;        continue;

Legend:
Removed from v.747  
changed lines
  Added in v.758

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12