/[pcre]/code/trunk/pcre_study.c
ViewVC logotype

Diff of /code/trunk/pcre_study.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 87 by nigel, Sat Feb 24 21:41:21 2007 UTC revision 353 by ph10, Mon Jul 7 15:44:24 2008 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2006 University of Cambridge             Copyright (c) 1997-2008 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 42  POSSIBILITY OF SUCH DAMAGE. Line 42  POSSIBILITY OF SUCH DAMAGE.
42  supporting functions. */  supporting functions. */
43    
44    
45    #ifdef HAVE_CONFIG_H
46    #include "config.h"
47    #endif
48    
49  #include "pcre_internal.h"  #include "pcre_internal.h"
50    
51    
52    /* Returns from set_start_bits() */
53    
54    enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE };
55    
56    
57  /*************************************************  /*************************************************
58  *      Set a bit and maybe its alternate case    *  *      Set a bit and maybe its alternate case    *
59  *************************************************/  *************************************************/
# Line 72  if (caseless && (cd->ctypes[c] & ctype_l Line 81  if (caseless && (cd->ctypes[c] & ctype_l
81    
82    
83  /*************************************************  /*************************************************
84  *          Create bitmap of starting chars       *  *          Create bitmap of starting bytes       *
85  *************************************************/  *************************************************/
86    
87  /* This function scans a compiled unanchored expression and attempts to build a  /* This function scans a compiled unanchored expression recursively and
88  bitmap of the set of initial characters. If it can't, it returns FALSE. As time  attempts to build a bitmap of the set of possible starting bytes. As time goes
89  goes by, we may be able to get more clever at doing this.  by, we may be able to get more clever at doing this. The SSB_CONTINUE return is
90    useful for parenthesized groups in patterns such as (a*)b where the group
91    provides some optional starting bytes but scanning must continue at the outer
92    level to find at least one mandatory byte. At the outermost level, this
93    function fails unless the result is SSB_DONE.
94    
95  Arguments:  Arguments:
96    code         points to an expression    code         points to an expression
# Line 86  Arguments: Line 99  Arguments:
99    utf8         TRUE if in UTF-8 mode    utf8         TRUE if in UTF-8 mode
100    cd           the block with char table pointers    cd           the block with char table pointers
101    
102  Returns:       TRUE if table built, FALSE otherwise  Returns:       SSB_FAIL     => Failed to find any starting bytes
103                   SSB_DONE     => Found mandatory starting bytes
104                   SSB_CONTINUE => Found optional starting bytes
105  */  */
106    
107  static BOOL  static int
108  set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,  set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
109    BOOL utf8, compile_data *cd)    BOOL utf8, compile_data *cd)
110  {  {
111  register int c;  register int c;
112    int yield = SSB_DONE;
113    
114    #if 0
115    /* ========================================================================= */
116    /* The following comment and code was inserted in January 1999. In May 2006,
117    when it was observed to cause compiler warnings about unused values, I took it
118    out again. If anybody is still using OS/2, they will have to put it back
119    manually. */
120    
121  /* This next statement and the later reference to dummy are here in order to  /* This next statement and the later reference to dummy are here in order to
122  trick the optimizer of the IBM C compiler for OS/2 into generating correct  trick the optimizer of the IBM C compiler for OS/2 into generating correct
# Line 102  disable optimization (in this module it Line 125  disable optimization (in this module it
125  the pcre module can use all the optimization it can get). */  the pcre module can use all the optimization it can get). */
126    
127  volatile int dummy;  volatile int dummy;
128    /* ========================================================================= */
129    #endif
130    
131  do  do
132    {    {
133    const uschar *tcode = code + 1 + LINK_SIZE;    const uschar *tcode = code + (((int)*code == OP_CBRA)? 3:1) + LINK_SIZE;
134    BOOL try_next = TRUE;    BOOL try_next = TRUE;
135    
136    while (try_next)    while (try_next)    /* Loop for items in this branch */
137      {      {
138      /* If a branch starts with a bracket or a positive lookahead assertion,      int rc;
139      recurse to set bits from within them. That's all for this branch. */      switch(*tcode)
   
     if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT)  
140        {        {
141        if (!set_start_bits(tcode, start_bits, caseless, utf8, cd))        /* Fail if we reach something we don't understand */
         return FALSE;  
       try_next = FALSE;  
       }  
142    
     else switch(*tcode)  
       {  
143        default:        default:
144        return FALSE;        return SSB_FAIL;
145    
146        /* Skip over callout */        /* If we hit a bracket or a positive lookahead assertion, recurse to set
147          bits from within the subpattern. If it can't find anything, we have to
148          give up. If it finds some mandatory character(s), we are done for this
149          branch. Otherwise, carry on scanning after the subpattern. */
150    
151          case OP_BRA:
152          case OP_SBRA:
153          case OP_CBRA:
154          case OP_SCBRA:
155          case OP_ONCE:
156          case OP_ASSERT:
157          rc = set_start_bits(tcode, start_bits, caseless, utf8, cd);
158          if (rc == SSB_FAIL) return SSB_FAIL;
159          if (rc == SSB_DONE) try_next = FALSE; else
160            {
161            do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
162            tcode += 1 + LINK_SIZE;
163            }
164          break;
165    
166        case OP_CALLOUT:        /* If we hit ALT or KET, it means we haven't found anything mandatory in
167        tcode += 2 + 2*LINK_SIZE;        this branch, though we might have found something optional. For ALT, we
168          continue with the next alternative, but we have to arrange that the final
169          result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
170          return SSB_CONTINUE: if this is the top level, that indicates failure,
171          but after a nested subpattern, it causes scanning to continue. */
172    
173          case OP_ALT:
174          yield = SSB_CONTINUE;
175          try_next = FALSE;
176        break;        break;
177    
178        /* Skip over extended extraction bracket number */        case OP_KET:
179          case OP_KETRMAX:
180          case OP_KETRMIN:
181          return SSB_CONTINUE;
182    
183        case OP_BRANUMBER:        /* Skip over callout */
184        tcode += 3;  
185          case OP_CALLOUT:
186          tcode += 2 + 2*LINK_SIZE;
187        break;        break;
188    
189        /* Skip over lookbehind and negative lookahead assertions */        /* Skip over lookbehind and negative lookahead assertions */
# Line 143  do Line 192  do
192        case OP_ASSERTBACK:        case OP_ASSERTBACK:
193        case OP_ASSERTBACK_NOT:        case OP_ASSERTBACK_NOT:
194        do tcode += GET(tcode, 1); while (*tcode == OP_ALT);        do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
195        tcode += 1+LINK_SIZE;        tcode += 1 + LINK_SIZE;
196        break;        break;
197    
198        /* Skip over an option setting, changing the caseless flag */        /* Skip over an option setting, changing the caseless flag */
# Line 157  do Line 206  do
206    
207        case OP_BRAZERO:        case OP_BRAZERO:
208        case OP_BRAMINZERO:        case OP_BRAMINZERO:
209        if (!set_start_bits(++tcode, start_bits, caseless, utf8, cd))        if (set_start_bits(++tcode, start_bits, caseless, utf8, cd) == SSB_FAIL)
210          return FALSE;          return SSB_FAIL;
211    /* =========================================================================
212          See the comment at the head of this function concerning the next line,
213          which was an old fudge for the benefit of OS/2.
214        dummy = 1;        dummy = 1;
215      ========================================================================= */
216          do tcode += GET(tcode,1); while (*tcode == OP_ALT);
217          tcode += 1 + LINK_SIZE;
218          break;
219    
220          /* SKIPZERO skips the bracket. */
221    
222          case OP_SKIPZERO:
223          tcode++;
224        do tcode += GET(tcode,1); while (*tcode == OP_ALT);        do tcode += GET(tcode,1); while (*tcode == OP_ALT);
225        tcode += 1+LINK_SIZE;        tcode += 1 + LINK_SIZE;
226        break;        break;
227    
228        /* Single-char * or ? sets the bit and tries the next item */        /* Single-char * or ? sets the bit and tries the next item */
229    
230        case OP_STAR:        case OP_STAR:
231        case OP_MINSTAR:        case OP_MINSTAR:
232          case OP_POSSTAR:
233        case OP_QUERY:        case OP_QUERY:
234        case OP_MINQUERY:        case OP_MINQUERY:
235          case OP_POSQUERY:
236        set_bit(start_bits, tcode[1], caseless, cd);        set_bit(start_bits, tcode[1], caseless, cd);
237        tcode += 2;        tcode += 2;
238  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
239        if (utf8) while ((*tcode & 0xc0) == 0x80) tcode++;        if (utf8 && tcode[-1] >= 0xc0)
240            tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];
241  #endif  #endif
242        break;        break;
243    
# Line 181  do Line 245  do
245    
246        case OP_UPTO:        case OP_UPTO:
247        case OP_MINUPTO:        case OP_MINUPTO:
248          case OP_POSUPTO:
249        set_bit(start_bits, tcode[3], caseless, cd);        set_bit(start_bits, tcode[3], caseless, cd);
250        tcode += 4;        tcode += 4;
251  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
252        if (utf8) while ((*tcode & 0xc0) == 0x80) tcode++;        if (utf8 && tcode[-1] >= 0xc0)
253            tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];
254  #endif  #endif
255        break;        break;
256    
# Line 197  do Line 263  do
263        case OP_CHARNC:        case OP_CHARNC:
264        case OP_PLUS:        case OP_PLUS:
265        case OP_MINPLUS:        case OP_MINPLUS:
266          case OP_POSPLUS:
267        set_bit(start_bits, tcode[1], caseless, cd);        set_bit(start_bits, tcode[1], caseless, cd);
268        try_next = FALSE;        try_next = FALSE;
269        break;        break;
# Line 215  do Line 282  do
282        try_next = FALSE;        try_next = FALSE;
283        break;        break;
284    
285          /* The cbit_space table has vertical tab as whitespace; we have to
286          discard it. */
287    
288        case OP_NOT_WHITESPACE:        case OP_NOT_WHITESPACE:
289        for (c = 0; c < 32; c++)        for (c = 0; c < 32; c++)
290          start_bits[c] |= ~cd->cbits[c+cbit_space];          {
291            int d = cd->cbits[c+cbit_space];
292            if (c == 1) d &= ~0x08;
293            start_bits[c] |= ~d;
294            }
295        try_next = FALSE;        try_next = FALSE;
296        break;        break;
297    
298          /* The cbit_space table has vertical tab as whitespace; we have to
299          discard it. */
300    
301        case OP_WHITESPACE:        case OP_WHITESPACE:
302        for (c = 0; c < 32; c++)        for (c = 0; c < 32; c++)
303          start_bits[c] |= cd->cbits[c+cbit_space];          {
304            int d = cd->cbits[c+cbit_space];
305            if (c == 1) d &= ~0x08;
306            start_bits[c] |= d;
307            }
308        try_next = FALSE;        try_next = FALSE;
309        break;        break;
310    
# Line 256  do Line 337  do
337    
338        case OP_TYPEUPTO:        case OP_TYPEUPTO:
339        case OP_TYPEMINUPTO:        case OP_TYPEMINUPTO:
340          case OP_TYPEPOSUPTO:
341        tcode += 2;               /* Fall through */        tcode += 2;               /* Fall through */
342    
343        case OP_TYPESTAR:        case OP_TYPESTAR:
344        case OP_TYPEMINSTAR:        case OP_TYPEMINSTAR:
345          case OP_TYPEPOSSTAR:
346        case OP_TYPEQUERY:        case OP_TYPEQUERY:
347        case OP_TYPEMINQUERY:        case OP_TYPEMINQUERY:
348          case OP_TYPEPOSQUERY:
349        switch(tcode[1])        switch(tcode[1])
350          {          {
351          case OP_ANY:          case OP_ANY:
352          return FALSE;          case OP_ALLANY:
353            return SSB_FAIL;
354    
355          case OP_NOT_DIGIT:          case OP_NOT_DIGIT:
356          for (c = 0; c < 32; c++)          for (c = 0; c < 32; c++)
# Line 277  do Line 362  do
362            start_bits[c] |= cd->cbits[c+cbit_digit];            start_bits[c] |= cd->cbits[c+cbit_digit];
363          break;          break;
364    
365            /* The cbit_space table has vertical tab as whitespace; we have to
366            discard it. */
367    
368          case OP_NOT_WHITESPACE:          case OP_NOT_WHITESPACE:
369          for (c = 0; c < 32; c++)          for (c = 0; c < 32; c++)
370            start_bits[c] |= ~cd->cbits[c+cbit_space];            {
371              int d = cd->cbits[c+cbit_space];
372              if (c == 1) d &= ~0x08;
373              start_bits[c] |= ~d;
374              }
375          break;          break;
376    
377            /* The cbit_space table has vertical tab as whitespace; we have to
378            discard it. */
379    
380          case OP_WHITESPACE:          case OP_WHITESPACE:
381          for (c = 0; c < 32; c++)          for (c = 0; c < 32; c++)
382            start_bits[c] |= cd->cbits[c+cbit_space];            {
383              int d = cd->cbits[c+cbit_space];
384              if (c == 1) d &= ~0x08;
385              start_bits[c] |= d;
386              }
387          break;          break;
388    
389          case OP_NOT_WORDCHAR:          case OP_NOT_WORDCHAR:
# Line 308  do Line 407  do
407        character with a value > 255. */        character with a value > 255. */
408    
409        case OP_NCLASS:        case OP_NCLASS:
410    #ifdef SUPPORT_UTF8
411        if (utf8)        if (utf8)
412          {          {
413          start_bits[24] |= 0xf0;              /* Bits for 0xc4 - 0xc8 */          start_bits[24] |= 0xf0;              /* Bits for 0xc4 - 0xc8 */
414          memset(start_bits+25, 0xff, 7);      /* Bits for 0xc9 - 0xff */          memset(start_bits+25, 0xff, 7);      /* Bits for 0xc9 - 0xff */
415          }          }
416    #endif
417        /* Fall through */        /* Fall through */
418    
419        case OP_CLASS:        case OP_CLASS:
# Line 325  do Line 426  do
426          value is > 127. In fact, there are only two possible starting bytes for          value is > 127. In fact, there are only two possible starting bytes for
427          characters in the range 128 - 255. */          characters in the range 128 - 255. */
428    
429    #ifdef SUPPORT_UTF8
430          if (utf8)          if (utf8)
431            {            {
432            for (c = 0; c < 16; c++) start_bits[c] |= tcode[c];            for (c = 0; c < 16; c++) start_bits[c] |= tcode[c];
# Line 342  do Line 444  do
444          /* In non-UTF-8 mode, the two bit maps are completely compatible. */          /* In non-UTF-8 mode, the two bit maps are completely compatible. */
445    
446          else          else
447    #endif
448            {            {
449            for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];            for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
450            }            }
# Line 377  do Line 480  do
480    code += GET(code, 1);   /* Advance to next branch */    code += GET(code, 1);   /* Advance to next branch */
481    }    }
482  while (*code == OP_ALT);  while (*code == OP_ALT);
483  return TRUE;  return yield;
484  }  }
485    
486    
# Line 401  Returns: pointer to a pcre_extra bloc Line 504  Returns: pointer to a pcre_extra bloc
504              NULL on error or if no optimization possible              NULL on error or if no optimization possible
505  */  */
506    
507  PCRE_DATA_SCOPE pcre_extra *  PCRE_EXP_DEFN pcre_extra *
508  pcre_study(const pcre *external_re, int options, const char **errorptr)  pcre_study(const pcre *external_re, int options, const char **errorptr)
509  {  {
510  uschar start_bits[32];  uschar start_bits[32];
511  pcre_extra *extra;  pcre_extra *extra;
512  pcre_study_data *study;  pcre_study_data *study;
513  const uschar *tables;  const uschar *tables;
514  const real_pcre *re = (const real_pcre *)external_re;  uschar *code;
 uschar *code = (uschar *)re + re->name_table_offset +  
   (re->name_count * re->name_entry_size);  
515  compile_data compile_block;  compile_data compile_block;
516    const real_pcre *re = (const real_pcre *)external_re;
517    
518  *errorptr = NULL;  *errorptr = NULL;
519    
# Line 427  if ((options & ~PUBLIC_STUDY_OPTIONS) != Line 529  if ((options & ~PUBLIC_STUDY_OPTIONS) !=
529    return NULL;    return NULL;
530    }    }
531    
532    code = (uschar *)re + re->name_table_offset +
533      (re->name_count * re->name_entry_size);
534    
535  /* For an anchored pattern, or an unanchored pattern that has a first char, or  /* For an anchored pattern, or an unanchored pattern that has a first char, or
536  a multiline pattern that matches only at "line starts", no further processing  a multiline pattern that matches only at "line starts", no further processing
537  at present. */  at present. */
538    
539  if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)  if ((re->options & PCRE_ANCHORED) != 0 ||
540        (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
541    return NULL;    return NULL;
542    
543  /* Set the character tables in the block that is passed around */  /* Set the character tables in the block that is passed around */
# Line 449  compile_block.ctypes = tables + ctypes_o Line 555  compile_block.ctypes = tables + ctypes_o
555  /* See if we can find a fixed set of initial characters for the pattern. */  /* See if we can find a fixed set of initial characters for the pattern. */
556    
557  memset(start_bits, 0, 32 * sizeof(uschar));  memset(start_bits, 0, 32 * sizeof(uschar));
558  if (!set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0,  if (set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0,
559    (re->options & PCRE_UTF8) != 0, &compile_block)) return NULL;    (re->options & PCRE_UTF8) != 0, &compile_block) != SSB_DONE) return NULL;
560    
561  /* Get a pcre_extra block and a pcre_study_data block. The study data is put in  /* Get a pcre_extra block and a pcre_study_data block. The study data is put in
562  the latter, which is pointed to by the former, which may also get additional  the latter, which is pointed to by the former, which may also get additional

Legend:
Removed from v.87  
changed lines
  Added in v.353

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12