/[pcre]/code/trunk/pcre_study.c
ViewVC logotype

Diff of /code/trunk/pcre_study.c

Parent Directory | Revision Log | View Patch

revision 92 by nigel, Sat Feb 24 21:41:34 2007 UTC | revision 93 by nigel, Sat Feb 24 21:41:42 2007 UTC
# Line 45  supporting functions. */ Line 45  supporting functions. */
45  #include "pcre_internal.h"  #include "pcre_internal.h"
46    
47    
48    /* Returns from set_start_bits() */
49    
50    enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE };
51    
52    
53  /*************************************************  /*************************************************
54  *      Set a bit and maybe its alternate case    *  *      Set a bit and maybe its alternate case    *
55  *************************************************/  *************************************************/
# Line 72  if (caseless && (cd->ctypes[c] & ctype_l Line 77  if (caseless && (cd->ctypes[c] & ctype_l
77    
78    
79  /*************************************************  /*************************************************
80  *          Create bitmap of starting chars       *  *          Create bitmap of starting bytes       *
81  *************************************************/  *************************************************/
82    
83  /* This function scans a compiled unanchored expression and attempts to build a  /* This function scans a compiled unanchored expression recursively and
84  bitmap of the set of initial characters. If it can't, it returns FALSE. As time  attempts to build a bitmap of the set of possible starting bytes. As time goes
85  goes by, we may be able to get more clever at doing this.  by, we may be able to get more clever at doing this. The SSB_CONTINUE return is
86    useful for parenthesized groups in patterns such as (a*)b where the group
87    provides some optional starting bytes but scanning must continue at the outer
88    level to find at least one mandatory byte. At the outermost level, this
89    function fails unless the result is SSB_DONE.
90    
91  Arguments:  Arguments:
92    code         points to an expression    code         points to an expression
# Line 86  Arguments: Line 95  Arguments:
95    utf8         TRUE if in UTF-8 mode    utf8         TRUE if in UTF-8 mode
96    cd           the block with char table pointers    cd           the block with char table pointers
97    
98  Returns:       TRUE if table built, FALSE otherwise  Returns:       SSB_FAIL     => Failed to find any starting bytes
99                   SSB_DONE     => Found mandatory starting bytes
100                   SSB_CONTINUE => Found optional starting bytes
101  */  */
102    
103  static BOOL  static int
104  set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,  set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
105    BOOL utf8, compile_data *cd)    BOOL utf8, compile_data *cd)
106  {  {
107  register int c;  register int c;
108    int yield = SSB_DONE;
109    
110  #if 0  #if 0
111  /* ========================================================================= */  /* ========================================================================= */
# Line 114  volatile int dummy; Line 126  volatile int dummy;
126    
127  do  do
128    {    {
129    const uschar *tcode = code + 1 + LINK_SIZE;    const uschar *tcode = code + (((int)*code == OP_CBRA)? 3:1) + LINK_SIZE;
130    BOOL try_next = TRUE;    BOOL try_next = TRUE;
131    
132    while (try_next)    while (try_next)    /* Loop for items in this branch */
133      {      {
134      /* If a branch starts with a bracket or a positive lookahead assertion,      int rc;
135      recurse to set bits from within them. That's all for this branch. */      switch(*tcode)
   
     if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT)  
136        {        {
137        if (!set_start_bits(tcode, start_bits, caseless, utf8, cd))        /* Fail if we reach something we don't understand */
         return FALSE;  
       try_next = FALSE;  
       }  
138    
     else switch(*tcode)  
       {  
139        default:        default:
140        return FALSE;        return SSB_FAIL;
141    
142        /* Skip over callout */        /* If we hit a bracket or a positive lookahead assertion, recurse to set
143          bits from within the subpattern. If it can't find anything, we have to
144          give up. If it finds some mandatory character(s), we are done for this
145          branch. Otherwise, carry on scanning after the subpattern. */
146    
147          case OP_BRA:
148          case OP_SBRA:
149          case OP_CBRA:
150          case OP_SCBRA:
151          case OP_ONCE:
152          case OP_ASSERT:
153          rc = set_start_bits(tcode, start_bits, caseless, utf8, cd);
154          if (rc == SSB_FAIL) return SSB_FAIL;
155          if (rc == SSB_DONE) try_next = FALSE; else
156            {
157            do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
158            tcode += 1 + LINK_SIZE;
159            }
160          break;
161    
162        case OP_CALLOUT:        /* If we hit ALT or KET, it means we haven't found anything mandatory in
163        tcode += 2 + 2*LINK_SIZE;        this branch, though we might have found something optional. For ALT, we
164          continue with the next alternative, but we have to arrange that the final
165          result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
166          return SSB_CONTINUE: if this is the top level, that indicates failure,
167          but after a nested subpattern, it causes scanning to continue. */
168    
169          case OP_ALT:
170          yield = SSB_CONTINUE;
171          try_next = FALSE;
172        break;        break;
173    
174        /* Skip over extended extraction bracket number */        case OP_KET:
175          case OP_KETRMAX:
176          case OP_KETRMIN:
177          return SSB_CONTINUE;
178    
179        case OP_BRANUMBER:        /* Skip over callout */
180        tcode += 3;  
181          case OP_CALLOUT:
182          tcode += 2 + 2*LINK_SIZE;
183        break;        break;
184    
185        /* Skip over lookbehind and negative lookahead assertions */        /* Skip over lookbehind and negative lookahead assertions */
# Line 152  do Line 188  do
188        case OP_ASSERTBACK:        case OP_ASSERTBACK:
189        case OP_ASSERTBACK_NOT:        case OP_ASSERTBACK_NOT:
190        do tcode += GET(tcode, 1); while (*tcode == OP_ALT);        do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
191        tcode += 1+LINK_SIZE;        tcode += 1 + LINK_SIZE;
192        break;        break;
193    
194        /* Skip over an option setting, changing the caseless flag */        /* Skip over an option setting, changing the caseless flag */
# Line 166  do Line 202  do
202    
203        case OP_BRAZERO:        case OP_BRAZERO:
204        case OP_BRAMINZERO:        case OP_BRAMINZERO:
205        if (!set_start_bits(++tcode, start_bits, caseless, utf8, cd))        if (set_start_bits(++tcode, start_bits, caseless, utf8, cd) == SSB_FAIL)
206          return FALSE;          return SSB_FAIL;
207  /* =========================================================================  /* =========================================================================
208        See the comment at the head of this function concerning the next line,        See the comment at the head of this function concerning the next line,
209        which was an old fudge for the benefit of OS/2.        which was an old fudge for the benefit of OS/2.
210        dummy = 1;        dummy = 1;
211    ========================================================================= */    ========================================================================= */
212        do tcode += GET(tcode,1); while (*tcode == OP_ALT);        do tcode += GET(tcode,1); while (*tcode == OP_ALT);
213        tcode += 1+LINK_SIZE;        tcode += 1 + LINK_SIZE;
214        break;        break;
215    
216        /* Single-char * or ? sets the bit and tries the next item */        /* Single-char * or ? sets the bit and tries the next item */
217    
218        case OP_STAR:        case OP_STAR:
219        case OP_MINSTAR:        case OP_MINSTAR:
220          case OP_POSSTAR:
221        case OP_QUERY:        case OP_QUERY:
222        case OP_MINQUERY:        case OP_MINQUERY:
223          case OP_POSQUERY:
224        set_bit(start_bits, tcode[1], caseless, cd);        set_bit(start_bits, tcode[1], caseless, cd);
225        tcode += 2;        tcode += 2;
226  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
227        if (utf8) while ((*tcode & 0xc0) == 0x80) tcode++;        if (utf8 && tcode[-1] >= 0xc0)
228            tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];
229  #endif  #endif
230        break;        break;
231    
# Line 194  do Line 233  do
233    
234        case OP_UPTO:        case OP_UPTO:
235        case OP_MINUPTO:        case OP_MINUPTO:
236          case OP_POSUPTO:
237        set_bit(start_bits, tcode[3], caseless, cd);        set_bit(start_bits, tcode[3], caseless, cd);
238        tcode += 4;        tcode += 4;
239  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
240        if (utf8) while ((*tcode & 0xc0) == 0x80) tcode++;        if (utf8 && tcode[-1] >= 0xc0)
241            tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];
242  #endif  #endif
243        break;        break;
244    
# Line 210  do Line 251  do
251        case OP_CHARNC:        case OP_CHARNC:
252        case OP_PLUS:        case OP_PLUS:
253        case OP_MINPLUS:        case OP_MINPLUS:
254          case OP_POSPLUS:
255        set_bit(start_bits, tcode[1], caseless, cd);        set_bit(start_bits, tcode[1], caseless, cd);
256        try_next = FALSE;        try_next = FALSE;
257        break;        break;
# Line 283  do Line 325  do
325    
326        case OP_TYPEUPTO:        case OP_TYPEUPTO:
327        case OP_TYPEMINUPTO:        case OP_TYPEMINUPTO:
328          case OP_TYPEPOSUPTO:
329        tcode += 2;               /* Fall through */        tcode += 2;               /* Fall through */
330    
331        case OP_TYPESTAR:        case OP_TYPESTAR:
332        case OP_TYPEMINSTAR:        case OP_TYPEMINSTAR:
333          case OP_TYPEPOSSTAR:
334        case OP_TYPEQUERY:        case OP_TYPEQUERY:
335        case OP_TYPEMINQUERY:        case OP_TYPEMINQUERY:
336          case OP_TYPEPOSQUERY:
337        switch(tcode[1])        switch(tcode[1])
338          {          {
339          case OP_ANY:          case OP_ANY:
340          return FALSE;          return SSB_FAIL;
341    
342          case OP_NOT_DIGIT:          case OP_NOT_DIGIT:
343          for (c = 0; c < 32; c++)          for (c = 0; c < 32; c++)
# Line 418  do Line 463  do
463    code += GET(code, 1);   /* Advance to next branch */    code += GET(code, 1);   /* Advance to next branch */
464    }    }
465  while (*code == OP_ALT);  while (*code == OP_ALT);
466  return TRUE;  return yield;
467  }  }
468    
469    
# Line 492  compile_block.ctypes = tables + ctypes_o Line 537  compile_block.ctypes = tables + ctypes_o
537  /* See if we can find a fixed set of initial characters for the pattern. */  /* See if we can find a fixed set of initial characters for the pattern. */
538    
539  memset(start_bits, 0, 32 * sizeof(uschar));  memset(start_bits, 0, 32 * sizeof(uschar));
540  if (!set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0,  if (set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0,
541    (re->options & PCRE_UTF8) != 0, &compile_block)) return NULL;    (re->options & PCRE_UTF8) != 0, &compile_block) != SSB_DONE) return NULL;
542    
543  /* Get a pcre_extra block and a pcre_study_data block. The study data is put in  /* Get a pcre_extra block and a pcre_study_data block. The study data is put in
544  the latter, which is pointed to by the former, which may also get additional  the latter, which is pointed to by the former, which may also get additional

Legend:
Removed from v.92
Changed lines
Added in v.93

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12