/[pcre]/code/trunk/pcre_study.c
ViewVC logotype

Diff of /code/trunk/pcre_study.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 87 by nigel, Sat Feb 24 21:41:21 2007 UTC revision 199 by ph10, Tue Jul 31 14:39:09 2007 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2006 University of Cambridge             Copyright (c) 1997-2007 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 42  POSSIBILITY OF SUCH DAMAGE. Line 42  POSSIBILITY OF SUCH DAMAGE.
42  supporting functions. */  supporting functions. */
43    
44    
45    #ifdef HAVE_CONFIG_H
46    #include <config.h>
47    #endif
48    
49  #include "pcre_internal.h"  #include "pcre_internal.h"
50    
51    
52    /* Returns from set_start_bits() */
53    
54    enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE };
55    
56    
57  /*************************************************  /*************************************************
58  *      Set a bit and maybe its alternate case    *  *      Set a bit and maybe its alternate case    *
59  *************************************************/  *************************************************/
# Line 72  if (caseless && (cd->ctypes[c] & ctype_l Line 81  if (caseless && (cd->ctypes[c] & ctype_l
81    
82    
83  /*************************************************  /*************************************************
84  *          Create bitmap of starting chars       *  *          Create bitmap of starting bytes       *
85  *************************************************/  *************************************************/
86    
87  /* This function scans a compiled unanchored expression and attempts to build a  /* This function scans a compiled unanchored expression recursively and
88  bitmap of the set of initial characters. If it can't, it returns FALSE. As time  attempts to build a bitmap of the set of possible starting bytes. As time goes
89  goes by, we may be able to get more clever at doing this.  by, we may be able to get more clever at doing this. The SSB_CONTINUE return is
90    useful for parenthesized groups in patterns such as (a*)b where the group
91    provides some optional starting bytes but scanning must continue at the outer
92    level to find at least one mandatory byte. At the outermost level, this
93    function fails unless the result is SSB_DONE.
94    
95  Arguments:  Arguments:
96    code         points to an expression    code         points to an expression
# Line 86  Arguments: Line 99  Arguments:
99    utf8         TRUE if in UTF-8 mode    utf8         TRUE if in UTF-8 mode
100    cd           the block with char table pointers    cd           the block with char table pointers
101    
102  Returns:       TRUE if table built, FALSE otherwise  Returns:       SSB_FAIL     => Failed to find any starting bytes
103                   SSB_DONE     => Found mandatory starting bytes
104                   SSB_CONTINUE => Found optional starting bytes
105  */  */
106    
107  static BOOL  static int
108  set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,  set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
109    BOOL utf8, compile_data *cd)    BOOL utf8, compile_data *cd)
110  {  {
111  register int c;  register int c;
112    int yield = SSB_DONE;
113    
114    #if 0
115    /* ========================================================================= */
116    /* The following comment and code was inserted in January 1999. In May 2006,
117    when it was observed to cause compiler warnings about unused values, I took it
118    out again. If anybody is still using OS/2, they will have to put it back
119    manually. */
120    
121  /* This next statement and the later reference to dummy are here in order to  /* This next statement and the later reference to dummy are here in order to
122  trick the optimizer of the IBM C compiler for OS/2 into generating correct  trick the optimizer of the IBM C compiler for OS/2 into generating correct
# Line 102  disable optimization (in this module it Line 125  disable optimization (in this module it
125  the pcre module can use all the optimization it can get). */  the pcre module can use all the optimization it can get). */
126    
127  volatile int dummy;  volatile int dummy;
128    /* ========================================================================= */
129    #endif
130    
131  do  do
132    {    {
133    const uschar *tcode = code + 1 + LINK_SIZE;    const uschar *tcode = code + (((int)*code == OP_CBRA)? 3:1) + LINK_SIZE;
134    BOOL try_next = TRUE;    BOOL try_next = TRUE;
135    
136    while (try_next)    while (try_next)    /* Loop for items in this branch */
137      {      {
138      /* If a branch starts with a bracket or a positive lookahead assertion,      int rc;
139      recurse to set bits from within them. That's all for this branch. */      switch(*tcode)
   
     if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT)  
140        {        {
141        if (!set_start_bits(tcode, start_bits, caseless, utf8, cd))        /* Fail if we reach something we don't understand */
         return FALSE;  
       try_next = FALSE;  
       }  
142    
     else switch(*tcode)  
       {  
143        default:        default:
144        return FALSE;        return SSB_FAIL;
145    
146        /* Skip over callout */        /* If we hit a bracket or a positive lookahead assertion, recurse to set
147          bits from within the subpattern. If it can't find anything, we have to
148          give up. If it finds some mandatory character(s), we are done for this
149          branch. Otherwise, carry on scanning after the subpattern. */
150    
151          case OP_BRA:
152          case OP_SBRA:
153          case OP_CBRA:
154          case OP_SCBRA:
155          case OP_ONCE:
156          case OP_ASSERT:
157          rc = set_start_bits(tcode, start_bits, caseless, utf8, cd);
158          if (rc == SSB_FAIL) return SSB_FAIL;
159          if (rc == SSB_DONE) try_next = FALSE; else
160            {
161            do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
162            tcode += 1 + LINK_SIZE;
163            }
164          break;
165    
166        case OP_CALLOUT:        /* If we hit ALT or KET, it means we haven't found anything mandatory in
167        tcode += 2 + 2*LINK_SIZE;        this branch, though we might have found something optional. For ALT, we
168          continue with the next alternative, but we have to arrange that the final
169          result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
170          return SSB_CONTINUE: if this is the top level, that indicates failure,
171          but after a nested subpattern, it causes scanning to continue. */
172    
173          case OP_ALT:
174          yield = SSB_CONTINUE;
175          try_next = FALSE;
176        break;        break;
177    
178        /* Skip over extended extraction bracket number */        case OP_KET:
179          case OP_KETRMAX:
180          case OP_KETRMIN:
181          return SSB_CONTINUE;
182    
183        case OP_BRANUMBER:        /* Skip over callout */
184        tcode += 3;  
185          case OP_CALLOUT:
186          tcode += 2 + 2*LINK_SIZE;
187        break;        break;
188    
189        /* Skip over lookbehind and negative lookahead assertions */        /* Skip over lookbehind and negative lookahead assertions */
# Line 143  do Line 192  do
192        case OP_ASSERTBACK:        case OP_ASSERTBACK:
193        case OP_ASSERTBACK_NOT:        case OP_ASSERTBACK_NOT:
194        do tcode += GET(tcode, 1); while (*tcode == OP_ALT);        do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
195        tcode += 1+LINK_SIZE;        tcode += 1 + LINK_SIZE;
196        break;        break;
197    
198        /* Skip over an option setting, changing the caseless flag */        /* Skip over an option setting, changing the caseless flag */
# Line 157  do Line 206  do
206    
207        case OP_BRAZERO:        case OP_BRAZERO:
208        case OP_BRAMINZERO:        case OP_BRAMINZERO:
209        if (!set_start_bits(++tcode, start_bits, caseless, utf8, cd))        if (set_start_bits(++tcode, start_bits, caseless, utf8, cd) == SSB_FAIL)
210          return FALSE;          return SSB_FAIL;
211    /* =========================================================================
212          See the comment at the head of this function concerning the next line,
213          which was an old fudge for the benefit of OS/2.
214        dummy = 1;        dummy = 1;
215      ========================================================================= */
216        do tcode += GET(tcode,1); while (*tcode == OP_ALT);        do tcode += GET(tcode,1); while (*tcode == OP_ALT);
217        tcode += 1+LINK_SIZE;        tcode += 1 + LINK_SIZE;
218        break;        break;
219    
220        /* Single-char * or ? sets the bit and tries the next item */        /* Single-char * or ? sets the bit and tries the next item */
221    
222        case OP_STAR:        case OP_STAR:
223        case OP_MINSTAR:        case OP_MINSTAR:
224          case OP_POSSTAR:
225        case OP_QUERY:        case OP_QUERY:
226        case OP_MINQUERY:        case OP_MINQUERY:
227          case OP_POSQUERY:
228        set_bit(start_bits, tcode[1], caseless, cd);        set_bit(start_bits, tcode[1], caseless, cd);
229        tcode += 2;        tcode += 2;
230  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
231        if (utf8) while ((*tcode & 0xc0) == 0x80) tcode++;        if (utf8 && tcode[-1] >= 0xc0)
232            tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];
233  #endif  #endif
234        break;        break;
235    
# Line 181  do Line 237  do
237    
238        case OP_UPTO:        case OP_UPTO:
239        case OP_MINUPTO:        case OP_MINUPTO:
240          case OP_POSUPTO:
241        set_bit(start_bits, tcode[3], caseless, cd);        set_bit(start_bits, tcode[3], caseless, cd);
242        tcode += 4;        tcode += 4;
243  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
244        if (utf8) while ((*tcode & 0xc0) == 0x80) tcode++;        if (utf8 && tcode[-1] >= 0xc0)
245            tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];
246  #endif  #endif
247        break;        break;
248    
# Line 197  do Line 255  do
255        case OP_CHARNC:        case OP_CHARNC:
256        case OP_PLUS:        case OP_PLUS:
257        case OP_MINPLUS:        case OP_MINPLUS:
258          case OP_POSPLUS:
259        set_bit(start_bits, tcode[1], caseless, cd);        set_bit(start_bits, tcode[1], caseless, cd);
260        try_next = FALSE;        try_next = FALSE;
261        break;        break;
# Line 215  do Line 274  do
274        try_next = FALSE;        try_next = FALSE;
275        break;        break;
276    
277          /* The cbit_space table has vertical tab as whitespace; we have to
278          discard it. */
279    
280        case OP_NOT_WHITESPACE:        case OP_NOT_WHITESPACE:
281        for (c = 0; c < 32; c++)        for (c = 0; c < 32; c++)
282          start_bits[c] |= ~cd->cbits[c+cbit_space];          {
283            int d = cd->cbits[c+cbit_space];
284            if (c == 1) d &= ~0x08;
285            start_bits[c] |= ~d;
286            }
287        try_next = FALSE;        try_next = FALSE;
288        break;        break;
289    
290          /* The cbit_space table has vertical tab as whitespace; we have to
291          discard it. */
292    
293        case OP_WHITESPACE:        case OP_WHITESPACE:
294        for (c = 0; c < 32; c++)        for (c = 0; c < 32; c++)
295          start_bits[c] |= cd->cbits[c+cbit_space];          {
296            int d = cd->cbits[c+cbit_space];
297            if (c == 1) d &= ~0x08;
298            start_bits[c] |= d;
299            }
300        try_next = FALSE;        try_next = FALSE;
301        break;        break;
302    
# Line 256  do Line 329  do
329    
330        case OP_TYPEUPTO:        case OP_TYPEUPTO:
331        case OP_TYPEMINUPTO:        case OP_TYPEMINUPTO:
332          case OP_TYPEPOSUPTO:
333        tcode += 2;               /* Fall through */        tcode += 2;               /* Fall through */
334    
335        case OP_TYPESTAR:        case OP_TYPESTAR:
336        case OP_TYPEMINSTAR:        case OP_TYPEMINSTAR:
337          case OP_TYPEPOSSTAR:
338        case OP_TYPEQUERY:        case OP_TYPEQUERY:
339        case OP_TYPEMINQUERY:        case OP_TYPEMINQUERY:
340          case OP_TYPEPOSQUERY:
341        switch(tcode[1])        switch(tcode[1])
342          {          {
343          case OP_ANY:          case OP_ANY:
344          return FALSE;          return SSB_FAIL;
345    
346          case OP_NOT_DIGIT:          case OP_NOT_DIGIT:
347          for (c = 0; c < 32; c++)          for (c = 0; c < 32; c++)
# Line 277  do Line 353  do
353            start_bits[c] |= cd->cbits[c+cbit_digit];            start_bits[c] |= cd->cbits[c+cbit_digit];
354          break;          break;
355    
356            /* The cbit_space table has vertical tab as whitespace; we have to
357            discard it. */
358    
359          case OP_NOT_WHITESPACE:          case OP_NOT_WHITESPACE:
360          for (c = 0; c < 32; c++)          for (c = 0; c < 32; c++)
361            start_bits[c] |= ~cd->cbits[c+cbit_space];            {
362              int d = cd->cbits[c+cbit_space];
363              if (c == 1) d &= ~0x08;
364              start_bits[c] |= ~d;
365              }
366          break;          break;
367    
368            /* The cbit_space table has vertical tab as whitespace; we have to
369            discard it. */
370    
371          case OP_WHITESPACE:          case OP_WHITESPACE:
372          for (c = 0; c < 32; c++)          for (c = 0; c < 32; c++)
373            start_bits[c] |= cd->cbits[c+cbit_space];            {
374              int d = cd->cbits[c+cbit_space];
375              if (c == 1) d &= ~0x08;
376              start_bits[c] |= d;
377              }
378          break;          break;
379    
380          case OP_NOT_WORDCHAR:          case OP_NOT_WORDCHAR:
# Line 308  do Line 398  do
398        character with a value > 255. */        character with a value > 255. */
399    
400        case OP_NCLASS:        case OP_NCLASS:
401    #ifdef SUPPORT_UTF8
402        if (utf8)        if (utf8)
403          {          {
404          start_bits[24] |= 0xf0;              /* Bits for 0xc4 - 0xc8 */          start_bits[24] |= 0xf0;              /* Bits for 0xc4 - 0xc8 */
405          memset(start_bits+25, 0xff, 7);      /* Bits for 0xc9 - 0xff */          memset(start_bits+25, 0xff, 7);      /* Bits for 0xc9 - 0xff */
406          }          }
407    #endif
408        /* Fall through */        /* Fall through */
409    
410        case OP_CLASS:        case OP_CLASS:
# Line 325  do Line 417  do
417          value is > 127. In fact, there are only two possible starting bytes for          value is > 127. In fact, there are only two possible starting bytes for
418          characters in the range 128 - 255. */          characters in the range 128 - 255. */
419    
420    #ifdef SUPPORT_UTF8
421          if (utf8)          if (utf8)
422            {            {
423            for (c = 0; c < 16; c++) start_bits[c] |= tcode[c];            for (c = 0; c < 16; c++) start_bits[c] |= tcode[c];
# Line 342  do Line 435  do
435          /* In non-UTF-8 mode, the two bit maps are completely compatible. */          /* In non-UTF-8 mode, the two bit maps are completely compatible. */
436    
437          else          else
438    #endif
439            {            {
440            for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];            for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
441            }            }
# Line 377  do Line 471  do
471    code += GET(code, 1);   /* Advance to next branch */    code += GET(code, 1);   /* Advance to next branch */
472    }    }
473  while (*code == OP_ALT);  while (*code == OP_ALT);
474  return TRUE;  return yield;
475  }  }
476    
477    
# Line 401  Returns: pointer to a pcre_extra bloc Line 495  Returns: pointer to a pcre_extra bloc
495              NULL on error or if no optimization possible              NULL on error or if no optimization possible
496  */  */
497    
498  PCRE_DATA_SCOPE pcre_extra *  PCRE_EXP_DEFN pcre_extra *
499  pcre_study(const pcre *external_re, int options, const char **errorptr)  pcre_study(const pcre *external_re, int options, const char **errorptr)
500  {  {
501  uschar start_bits[32];  uschar start_bits[32];
502  pcre_extra *extra;  pcre_extra *extra;
503  pcre_study_data *study;  pcre_study_data *study;
504  const uschar *tables;  const uschar *tables;
505  const real_pcre *re = (const real_pcre *)external_re;  uschar *code;
 uschar *code = (uschar *)re + re->name_table_offset +  
   (re->name_count * re->name_entry_size);  
506  compile_data compile_block;  compile_data compile_block;
507    const real_pcre *re = (const real_pcre *)external_re;
508    
509  *errorptr = NULL;  *errorptr = NULL;
510    
# Line 427  if ((options & ~PUBLIC_STUDY_OPTIONS) != Line 520  if ((options & ~PUBLIC_STUDY_OPTIONS) !=
520    return NULL;    return NULL;
521    }    }
522    
523    code = (uschar *)re + re->name_table_offset +
524      (re->name_count * re->name_entry_size);
525    
526  /* For an anchored pattern, or an unanchored pattern that has a first char, or  /* For an anchored pattern, or an unanchored pattern that has a first char, or
527  a multiline pattern that matches only at "line starts", no further processing  a multiline pattern that matches only at "line starts", no further processing
528  at present. */  at present. */
# Line 449  compile_block.ctypes = tables + ctypes_o Line 545  compile_block.ctypes = tables + ctypes_o
545  /* See if we can find a fixed set of initial characters for the pattern. */  /* See if we can find a fixed set of initial characters for the pattern. */
546    
547  memset(start_bits, 0, 32 * sizeof(uschar));  memset(start_bits, 0, 32 * sizeof(uschar));
548  if (!set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0,  if (set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0,
549    (re->options & PCRE_UTF8) != 0, &compile_block)) return NULL;    (re->options & PCRE_UTF8) != 0, &compile_block) != SSB_DONE) return NULL;
550    
551  /* Get a pcre_extra block and a pcre_study_data block. The study data is put in  /* Get a pcre_extra block and a pcre_study_data block. The study data is put in
552  the latter, which is pointed to by the former, which may also get additional  the latter, which is pointed to by the former, which may also get additional

Legend:
Removed from v.87  
changed lines
  Added in v.199

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12