/[pcre]/code/trunk/pcre_study.c
ViewVC logotype

Diff of /code/trunk/pcre_study.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 85 by nigel, Sat Feb 24 21:41:13 2007 UTC revision 342 by ph10, Sun Apr 20 17:10:13 2008 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2005 University of Cambridge             Copyright (c) 1997-2008 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 42  POSSIBILITY OF SUCH DAMAGE. Line 42  POSSIBILITY OF SUCH DAMAGE.
42  supporting functions. */  supporting functions. */
43    
44    
45    #ifdef HAVE_CONFIG_H
46    #include "config.h"
47    #endif
48    
49  #include "pcre_internal.h"  #include "pcre_internal.h"
50    
51    
52    /* Returns from set_start_bits() */
53    
54    enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE };
55    
56    
57  /*************************************************  /*************************************************
58  *      Set a bit and maybe its alternate case    *  *      Set a bit and maybe its alternate case    *
59  *************************************************/  *************************************************/
# Line 72  if (caseless && (cd->ctypes[c] & ctype_l Line 81  if (caseless && (cd->ctypes[c] & ctype_l
81    
82    
83  /*************************************************  /*************************************************
84  *          Create bitmap of starting chars       *  *          Create bitmap of starting bytes       *
85  *************************************************/  *************************************************/
86    
87  /* This function scans a compiled unanchored expression and attempts to build a  /* This function scans a compiled unanchored expression recursively and
88  bitmap of the set of initial characters. If it can't, it returns FALSE. As time  attempts to build a bitmap of the set of possible starting bytes. As time goes
89  goes by, we may be able to get more clever at doing this.  by, we may be able to get more clever at doing this. The SSB_CONTINUE return is
90    useful for parenthesized groups in patterns such as (a*)b where the group
91    provides some optional starting bytes but scanning must continue at the outer
92    level to find at least one mandatory byte. At the outermost level, this
93    function fails unless the result is SSB_DONE.
94    
95  Arguments:  Arguments:
96    code         points to an expression    code         points to an expression
# Line 86  Arguments: Line 99  Arguments:
99    utf8         TRUE if in UTF-8 mode    utf8         TRUE if in UTF-8 mode
100    cd           the block with char table pointers    cd           the block with char table pointers
101    
102  Returns:       TRUE if table built, FALSE otherwise  Returns:       SSB_FAIL     => Failed to find any starting bytes
103                   SSB_DONE     => Found mandatory starting bytes
104                   SSB_CONTINUE => Found optional starting bytes
105  */  */
106    
107  static BOOL  static int
108  set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,  set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
109    BOOL utf8, compile_data *cd)    BOOL utf8, compile_data *cd)
110  {  {
111  register int c;  register int c;
112    int yield = SSB_DONE;
113    
114    #if 0
115    /* ========================================================================= */
116    /* The following comment and code was inserted in January 1999. In May 2006,
117    when it was observed to cause compiler warnings about unused values, I took it
118    out again. If anybody is still using OS/2, they will have to put it back
119    manually. */
120    
121  /* This next statement and the later reference to dummy are here in order to  /* This next statement and the later reference to dummy are here in order to
122  trick the optimizer of the IBM C compiler for OS/2 into generating correct  trick the optimizer of the IBM C compiler for OS/2 into generating correct
# Line 102  disable optimization (in this module it Line 125  disable optimization (in this module it
125  the pcre module can use all the optimization it can get). */  the pcre module can use all the optimization it can get). */
126    
127  volatile int dummy;  volatile int dummy;
128    /* ========================================================================= */
129    #endif
130    
131  do  do
132    {    {
133    const uschar *tcode = code + 1 + LINK_SIZE;    const uschar *tcode = code + (((int)*code == OP_CBRA)? 3:1) + LINK_SIZE;
134    BOOL try_next = TRUE;    BOOL try_next = TRUE;
135    
136    while (try_next)    while (try_next)    /* Loop for items in this branch */
137      {      {
138      /* If a branch starts with a bracket or a positive lookahead assertion,      int rc;
139      recurse to set bits from within them. That's all for this branch. */      switch(*tcode)
   
     if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT)  
140        {        {
141        if (!set_start_bits(tcode, start_bits, caseless, utf8, cd))        /* Fail if we reach something we don't understand */
         return FALSE;  
       try_next = FALSE;  
       }  
142    
     else switch(*tcode)  
       {  
143        default:        default:
144        return FALSE;        return SSB_FAIL;
145    
146        /* Skip over callout */        /* If we hit a bracket or a positive lookahead assertion, recurse to set
147          bits from within the subpattern. If it can't find anything, we have to
148          give up. If it finds some mandatory character(s), we are done for this
149          branch. Otherwise, carry on scanning after the subpattern. */
150    
151          case OP_BRA:
152          case OP_SBRA:
153          case OP_CBRA:
154          case OP_SCBRA:
155          case OP_ONCE:
156          case OP_ASSERT:
157          rc = set_start_bits(tcode, start_bits, caseless, utf8, cd);
158          if (rc == SSB_FAIL) return SSB_FAIL;
159          if (rc == SSB_DONE) try_next = FALSE; else
160            {
161            do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
162            tcode += 1 + LINK_SIZE;
163            }
164          break;
165    
166        case OP_CALLOUT:        /* If we hit ALT or KET, it means we haven't found anything mandatory in
167        tcode += 2 + 2*LINK_SIZE;        this branch, though we might have found something optional. For ALT, we
168          continue with the next alternative, but we have to arrange that the final
169          result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
170          return SSB_CONTINUE: if this is the top level, that indicates failure,
171          but after a nested subpattern, it causes scanning to continue. */
172    
173          case OP_ALT:
174          yield = SSB_CONTINUE;
175          try_next = FALSE;
176        break;        break;
177    
178        /* Skip over extended extraction bracket number */        case OP_KET:
179          case OP_KETRMAX:
180          case OP_KETRMIN:
181          return SSB_CONTINUE;
182    
183        case OP_BRANUMBER:        /* Skip over callout */
184        tcode += 3;  
185          case OP_CALLOUT:
186          tcode += 2 + 2*LINK_SIZE;
187        break;        break;
188    
189        /* Skip over lookbehind and negative lookahead assertions */        /* Skip over lookbehind and negative lookahead assertions */
# Line 143  do Line 192  do
192        case OP_ASSERTBACK:        case OP_ASSERTBACK:
193        case OP_ASSERTBACK_NOT:        case OP_ASSERTBACK_NOT:
194        do tcode += GET(tcode, 1); while (*tcode == OP_ALT);        do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
195        tcode += 1+LINK_SIZE;        tcode += 1 + LINK_SIZE;
196        break;        break;
197    
198        /* Skip over an option setting, changing the caseless flag */        /* Skip over an option setting, changing the caseless flag */
# Line 157  do Line 206  do
206    
207        case OP_BRAZERO:        case OP_BRAZERO:
208        case OP_BRAMINZERO:        case OP_BRAMINZERO:
209        if (!set_start_bits(++tcode, start_bits, caseless, utf8, cd))        if (set_start_bits(++tcode, start_bits, caseless, utf8, cd) == SSB_FAIL)
210          return FALSE;          return SSB_FAIL;
211    /* =========================================================================
212          See the comment at the head of this function concerning the next line,
213          which was an old fudge for the benefit of OS/2.
214        dummy = 1;        dummy = 1;
215      ========================================================================= */
216          do tcode += GET(tcode,1); while (*tcode == OP_ALT);
217          tcode += 1 + LINK_SIZE;
218          break;
219    
220          /* SKIPZERO skips the bracket. */
221    
222          case OP_SKIPZERO:
223        do tcode += GET(tcode,1); while (*tcode == OP_ALT);        do tcode += GET(tcode,1); while (*tcode == OP_ALT);
224        tcode += 1+LINK_SIZE;        tcode += 1 + LINK_SIZE;
225        break;        break;
226    
227        /* Single-char * or ? sets the bit and tries the next item */        /* Single-char * or ? sets the bit and tries the next item */
228    
229        case OP_STAR:        case OP_STAR:
230        case OP_MINSTAR:        case OP_MINSTAR:
231          case OP_POSSTAR:
232        case OP_QUERY:        case OP_QUERY:
233        case OP_MINQUERY:        case OP_MINQUERY:
234          case OP_POSQUERY:
235        set_bit(start_bits, tcode[1], caseless, cd);        set_bit(start_bits, tcode[1], caseless, cd);
236        tcode += 2;        tcode += 2;
237  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
238        if (utf8) while ((*tcode & 0xc0) == 0x80) tcode++;        if (utf8 && tcode[-1] >= 0xc0)
239            tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];
240  #endif  #endif
241        break;        break;
242    
# Line 181  do Line 244  do
244    
245        case OP_UPTO:        case OP_UPTO:
246        case OP_MINUPTO:        case OP_MINUPTO:
247          case OP_POSUPTO:
248        set_bit(start_bits, tcode[3], caseless, cd);        set_bit(start_bits, tcode[3], caseless, cd);
249        tcode += 4;        tcode += 4;
250  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
251        if (utf8) while ((*tcode & 0xc0) == 0x80) tcode++;        if (utf8 && tcode[-1] >= 0xc0)
252            tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];
253  #endif  #endif
254        break;        break;
255    
# Line 197  do Line 262  do
262        case OP_CHARNC:        case OP_CHARNC:
263        case OP_PLUS:        case OP_PLUS:
264        case OP_MINPLUS:        case OP_MINPLUS:
265          case OP_POSPLUS:
266        set_bit(start_bits, tcode[1], caseless, cd);        set_bit(start_bits, tcode[1], caseless, cd);
267        try_next = FALSE;        try_next = FALSE;
268        break;        break;
# Line 215  do Line 281  do
281        try_next = FALSE;        try_next = FALSE;
282        break;        break;
283    
284          /* The cbit_space table has vertical tab as whitespace; we have to
285          discard it. */
286    
287        case OP_NOT_WHITESPACE:        case OP_NOT_WHITESPACE:
288        for (c = 0; c < 32; c++)        for (c = 0; c < 32; c++)
289          start_bits[c] |= ~cd->cbits[c+cbit_space];          {
290            int d = cd->cbits[c+cbit_space];
291            if (c == 1) d &= ~0x08;
292            start_bits[c] |= ~d;
293            }
294        try_next = FALSE;        try_next = FALSE;
295        break;        break;
296    
297          /* The cbit_space table has vertical tab as whitespace; we have to
298          discard it. */
299    
300        case OP_WHITESPACE:        case OP_WHITESPACE:
301        for (c = 0; c < 32; c++)        for (c = 0; c < 32; c++)
302          start_bits[c] |= cd->cbits[c+cbit_space];          {
303            int d = cd->cbits[c+cbit_space];
304            if (c == 1) d &= ~0x08;
305            start_bits[c] |= d;
306            }
307        try_next = FALSE;        try_next = FALSE;
308        break;        break;
309    
# Line 256  do Line 336  do
336    
337        case OP_TYPEUPTO:        case OP_TYPEUPTO:
338        case OP_TYPEMINUPTO:        case OP_TYPEMINUPTO:
339          case OP_TYPEPOSUPTO:
340        tcode += 2;               /* Fall through */        tcode += 2;               /* Fall through */
341    
342        case OP_TYPESTAR:        case OP_TYPESTAR:
343        case OP_TYPEMINSTAR:        case OP_TYPEMINSTAR:
344          case OP_TYPEPOSSTAR:
345        case OP_TYPEQUERY:        case OP_TYPEQUERY:
346        case OP_TYPEMINQUERY:        case OP_TYPEMINQUERY:
347          case OP_TYPEPOSQUERY:
348        switch(tcode[1])        switch(tcode[1])
349          {          {
350          case OP_ANY:          case OP_ANY:
351          return FALSE;          case OP_ALLANY:
352            return SSB_FAIL;
353    
354          case OP_NOT_DIGIT:          case OP_NOT_DIGIT:
355          for (c = 0; c < 32; c++)          for (c = 0; c < 32; c++)
# Line 277  do Line 361  do
361            start_bits[c] |= cd->cbits[c+cbit_digit];            start_bits[c] |= cd->cbits[c+cbit_digit];
362          break;          break;
363    
364            /* The cbit_space table has vertical tab as whitespace; we have to
365            discard it. */
366    
367          case OP_NOT_WHITESPACE:          case OP_NOT_WHITESPACE:
368          for (c = 0; c < 32; c++)          for (c = 0; c < 32; c++)
369            start_bits[c] |= ~cd->cbits[c+cbit_space];            {
370              int d = cd->cbits[c+cbit_space];
371              if (c == 1) d &= ~0x08;
372              start_bits[c] |= ~d;
373              }
374          break;          break;
375    
376            /* The cbit_space table has vertical tab as whitespace; we have to
377            discard it. */
378    
379          case OP_WHITESPACE:          case OP_WHITESPACE:
380          for (c = 0; c < 32; c++)          for (c = 0; c < 32; c++)
381            start_bits[c] |= cd->cbits[c+cbit_space];            {
382              int d = cd->cbits[c+cbit_space];
383              if (c == 1) d &= ~0x08;
384              start_bits[c] |= d;
385              }
386          break;          break;
387    
388          case OP_NOT_WORDCHAR:          case OP_NOT_WORDCHAR:
# Line 308  do Line 406  do
406        character with a value > 255. */        character with a value > 255. */
407    
408        case OP_NCLASS:        case OP_NCLASS:
409    #ifdef SUPPORT_UTF8
410        if (utf8)        if (utf8)
411          {          {
412          start_bits[24] |= 0xf0;              /* Bits for 0xc4 - 0xc8 */          start_bits[24] |= 0xf0;              /* Bits for 0xc4 - 0xc8 */
413          memset(start_bits+25, 0xff, 7);      /* Bits for 0xc9 - 0xff */          memset(start_bits+25, 0xff, 7);      /* Bits for 0xc9 - 0xff */
414          }          }
415    #endif
416        /* Fall through */        /* Fall through */
417    
418        case OP_CLASS:        case OP_CLASS:
# Line 325  do Line 425  do
425          value is > 127. In fact, there are only two possible starting bytes for          value is > 127. In fact, there are only two possible starting bytes for
426          characters in the range 128 - 255. */          characters in the range 128 - 255. */
427    
428    #ifdef SUPPORT_UTF8
429          if (utf8)          if (utf8)
430            {            {
431            for (c = 0; c < 16; c++) start_bits[c] |= tcode[c];            for (c = 0; c < 16; c++) start_bits[c] |= tcode[c];
# Line 342  do Line 443  do
443          /* In non-UTF-8 mode, the two bit maps are completely compatible. */          /* In non-UTF-8 mode, the two bit maps are completely compatible. */
444    
445          else          else
446    #endif
447            {            {
448            for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];            for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
449            }            }
# Line 377  do Line 479  do
479    code += GET(code, 1);   /* Advance to next branch */    code += GET(code, 1);   /* Advance to next branch */
480    }    }
481  while (*code == OP_ALT);  while (*code == OP_ALT);
482  return TRUE;  return yield;
483  }  }
484    
485    
# Line 401  Returns: pointer to a pcre_extra bloc Line 503  Returns: pointer to a pcre_extra bloc
503              NULL on error or if no optimization possible              NULL on error or if no optimization possible
504  */  */
505    
506  PCRE_EXPORT pcre_extra *  PCRE_EXP_DEFN pcre_extra *
507  pcre_study(const pcre *external_re, int options, const char **errorptr)  pcre_study(const pcre *external_re, int options, const char **errorptr)
508  {  {
509  uschar start_bits[32];  uschar start_bits[32];
510  pcre_extra *extra;  pcre_extra *extra;
511  pcre_study_data *study;  pcre_study_data *study;
512  const uschar *tables;  const uschar *tables;
513  const real_pcre *re = (const real_pcre *)external_re;  uschar *code;
 uschar *code = (uschar *)re + re->name_table_offset +  
   (re->name_count * re->name_entry_size);  
514  compile_data compile_block;  compile_data compile_block;
515    const real_pcre *re = (const real_pcre *)external_re;
516    
517  *errorptr = NULL;  *errorptr = NULL;
518    
# Line 427  if ((options & ~PUBLIC_STUDY_OPTIONS) != Line 528  if ((options & ~PUBLIC_STUDY_OPTIONS) !=
528    return NULL;    return NULL;
529    }    }
530    
531    code = (uschar *)re + re->name_table_offset +
532      (re->name_count * re->name_entry_size);
533    
534  /* For an anchored pattern, or an unanchored pattern that has a first char, or  /* For an anchored pattern, or an unanchored pattern that has a first char, or
535  a multiline pattern that matches only at "line starts", no further processing  a multiline pattern that matches only at "line starts", no further processing
536  at present. */  at present. */
537    
538  if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)  if ((re->options & PCRE_ANCHORED) != 0 ||
539        (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
540    return NULL;    return NULL;
541    
542  /* Set the character tables in the block that is passed around */  /* Set the character tables in the block that is passed around */
# Line 449  compile_block.ctypes = tables + ctypes_o Line 554  compile_block.ctypes = tables + ctypes_o
554  /* See if we can find a fixed set of initial characters for the pattern. */  /* See if we can find a fixed set of initial characters for the pattern. */
555    
556  memset(start_bits, 0, 32 * sizeof(uschar));  memset(start_bits, 0, 32 * sizeof(uschar));
557  if (!set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0,  if (set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0,
558    (re->options & PCRE_UTF8) != 0, &compile_block)) return NULL;    (re->options & PCRE_UTF8) != 0, &compile_block) != SSB_DONE) return NULL;
559    
560  /* Get a pcre_extra block and a pcre_study_data block. The study data is put in  /* Get a pcre_extra block and a pcre_study_data block. The study data is put in
561  the latter, which is pointed to by the former, which may also get additional  the latter, which is pointed to by the former, which may also get additional

Legend:
Removed from v.85  
changed lines
  Added in v.342

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12