/[pcre]/code/trunk/pcre_study.c
ViewVC logotype

Diff of /code/trunk/pcre_study.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 107 by ph10, Wed Mar 7 11:02:28 2007 UTC revision 461 by ph10, Mon Oct 5 10:59:35 2009 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2006 University of Cambridge             Copyright (c) 1997-2009 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 42  POSSIBILITY OF SUCH DAMAGE. Line 42  POSSIBILITY OF SUCH DAMAGE.
42  supporting functions. */  supporting functions. */
43    
44    
45    #ifdef HAVE_CONFIG_H
46    #include "config.h"
47    #endif
48    
49  #include "pcre_internal.h"  #include "pcre_internal.h"
50    
51    
# Line 50  supporting functions. */ Line 54  supporting functions. */
54  enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE };  enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE };
55    
56    
57    
58    /*************************************************
59    *   Find the minimum subject length for a group  *
60    *************************************************/
61    
62    /* Scan a parenthesized group and compute the minimum length of subject that
63    is needed to match it. This is a lower bound; it does not mean there is a
64    string of that length that matches. In UTF8 mode, the result is in characters
65    rather than bytes.
66    
67    Arguments:
68      code       pointer to start of group (the bracket)
69      startcode  pointer to start of the whole pattern
70      options    the compiling options
71    
72    Returns:   the minimum length
73               -1 if \C was encountered
74               -2 internal error (missing capturing bracket)
75    */
76    
77    static int
78    find_minlength(const uschar *code, const uschar *startcode, int options)
79    {
80    int length = -1;
81    BOOL utf8 = (options & PCRE_UTF8) != 0;
82    BOOL had_recurse = FALSE;
83    register int branchlength = 0;
84    register uschar *cc = (uschar *)code + 1 + LINK_SIZE;
85    
86    if (*code == OP_CBRA || *code == OP_SCBRA) cc += 2;
87    
88    /* Scan along the opcodes for this branch. If we get to the end of the
89    branch, check the length against that of the other branches. */
90    
91    for (;;)
92      {
93      int d, min;
94      uschar *cs, *ce;
95      register int op = *cc;
96    
97      switch (op)
98        {
99        case OP_CBRA:
100        case OP_SCBRA:
101        case OP_BRA:
102        case OP_SBRA:
103        case OP_ONCE:
104        case OP_COND:
105        case OP_SCOND:
106        d = find_minlength(cc, startcode, options);
107        if (d < 0) return d;
108        branchlength += d;
109        do cc += GET(cc, 1); while (*cc == OP_ALT);
110        cc += 1 + LINK_SIZE;
111        break;
112    
113        /* Reached end of a branch; if it's a ket it is the end of a nested
114        call. If it's ALT it is an alternation in a nested call. If it is
115        END it's the end of the outer call. All can be handled by the same code. */
116    
117        case OP_ALT:
118        case OP_KET:
119        case OP_KETRMAX:
120        case OP_KETRMIN:
121        case OP_END:
122        if (length < 0 || (!had_recurse && branchlength < length))
123          length = branchlength;
124        if (*cc != OP_ALT) return length;
125        cc += 1 + LINK_SIZE;
126        branchlength = 0;
127        had_recurse = FALSE;
128        break;
129    
130        /* Skip over assertive subpatterns */
131    
132        case OP_ASSERT:
133        case OP_ASSERT_NOT:
134        case OP_ASSERTBACK:
135        case OP_ASSERTBACK_NOT:
136        do cc += GET(cc, 1); while (*cc == OP_ALT);
137        /* Fall through */
138    
139        /* Skip over things that don't match chars */
140    
141        case OP_REVERSE:
142        case OP_CREF:
143        case OP_NCREF:
144        case OP_RREF:
145        case OP_NRREF:
146        case OP_DEF:
147        case OP_OPT:
148        case OP_CALLOUT:
149        case OP_SOD:
150        case OP_SOM:
151        case OP_EOD:
152        case OP_EODN:
153        case OP_CIRC:
154        case OP_DOLL:
155        case OP_NOT_WORD_BOUNDARY:
156        case OP_WORD_BOUNDARY:
157        cc += _pcre_OP_lengths[*cc];
158        break;
159    
160        /* Skip over a subpattern that has a {0} or {0,x} quantifier */
161    
162        case OP_BRAZERO:
163        case OP_BRAMINZERO:
164        case OP_SKIPZERO:
165        cc += _pcre_OP_lengths[*cc];
166        do cc += GET(cc, 1); while (*cc == OP_ALT);
167        cc += 1 + LINK_SIZE;
168        break;
169    
170        /* Handle literal characters and + repetitions */
171    
172        case OP_CHAR:
173        case OP_CHARNC:
174        case OP_NOT:
175        case OP_PLUS:
176        case OP_MINPLUS:
177        case OP_POSPLUS:
178        case OP_NOTPLUS:
179        case OP_NOTMINPLUS:
180        case OP_NOTPOSPLUS:
181        branchlength++;
182        cc += 2;
183    #ifdef SUPPORT_UTF8
184        if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
185    #endif
186        break;
187    
188        case OP_TYPEPLUS:
189        case OP_TYPEMINPLUS:
190        case OP_TYPEPOSPLUS:
191        branchlength++;
192        cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
193        break;
194    
195        /* Handle exact repetitions. The count is already in characters, but we
196        need to skip over a multibyte character in UTF8 mode.  */
197    
198        case OP_EXACT:
199        case OP_NOTEXACT:
200        branchlength += GET2(cc,1);
201        cc += 4;
202    #ifdef SUPPORT_UTF8
203        if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
204    #endif
205        break;
206    
207        case OP_TYPEEXACT:
208        branchlength += GET2(cc,1);
209        cc += (cc[3] == OP_PROP || cc[3] == OP_NOTPROP)? 6 : 4;
210        break;
211    
212        /* Handle single-char non-literal matchers */
213    
214        case OP_PROP:
215        case OP_NOTPROP:
216        cc += 2;
217        /* Fall through */
218    
219        case OP_NOT_DIGIT:
220        case OP_DIGIT:
221        case OP_NOT_WHITESPACE:
222        case OP_WHITESPACE:
223        case OP_NOT_WORDCHAR:
224        case OP_WORDCHAR:
225        case OP_ANY:
226        case OP_ALLANY:
227        case OP_EXTUNI:
228        case OP_HSPACE:
229        case OP_NOT_HSPACE:
230        case OP_VSPACE:
231        case OP_NOT_VSPACE:
232        branchlength++;
233        cc++;
234        break;
235    
236        /* "Any newline" might match two characters */
237    
238        case OP_ANYNL:
239        branchlength += 2;
240        cc++;
241        break;
242    
243        /* The single-byte matcher means we can't proceed in UTF-8 mode */
244    
245        case OP_ANYBYTE:
246    #ifdef SUPPORT_UTF8
247        if (utf8) return -1;
248    #endif
249        branchlength++;
250        cc++;
251        break;
252    
253        /* For repeated character types, we have to test for \p and \P, which have
254        an extra two bytes of parameters. */
255    
256        case OP_TYPESTAR:
257        case OP_TYPEMINSTAR:
258        case OP_TYPEQUERY:
259        case OP_TYPEMINQUERY:
260        case OP_TYPEPOSSTAR:
261        case OP_TYPEPOSQUERY:
262        if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
263        cc += _pcre_OP_lengths[op];
264        break;
265    
266        case OP_TYPEUPTO:
267        case OP_TYPEMINUPTO:
268        case OP_TYPEPOSUPTO:
269        if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
270        cc += _pcre_OP_lengths[op];
271        break;
272    
273        /* Check a class for variable quantification */
274    
275    #ifdef SUPPORT_UTF8
276        case OP_XCLASS:
277        cc += GET(cc, 1) - 33;
278        /* Fall through */
279    #endif
280    
281        case OP_CLASS:
282        case OP_NCLASS:
283        cc += 33;
284    
285        switch (*cc)
286          {
287          case OP_CRPLUS:
288          case OP_CRMINPLUS:
289          branchlength++;
290          /* Fall through */
291    
292          case OP_CRSTAR:
293          case OP_CRMINSTAR:
294          case OP_CRQUERY:
295          case OP_CRMINQUERY:
296          cc++;
297          break;
298    
299          case OP_CRRANGE:
300          case OP_CRMINRANGE:
301          branchlength += GET2(cc,1);
302          cc += 5;
303          break;
304    
305          default:
306          branchlength++;
307          break;
308          }
309        break;
310    
311        /* Backreferences and subroutine calls are treated in the same way: we find
312        the minimum length for the subpattern. A recursion, however, causes an
313        a flag to be set that causes the length of this branch to be ignored. The
314        logic is that a recursion can only make sense if there is another
315        alternation that stops the recursing. That will provide the minimum length
316        (when no recursion happens). A backreference within the group that it is
317        referencing behaves in the same way. */
318    
319        case OP_REF:
320        ce = cs = (uschar *)_pcre_find_bracket(startcode, utf8, GET2(cc, 1));
321        if (cs == NULL) return -2;
322        do ce += GET(ce, 1); while (*ce == OP_ALT);
323        if (cc > cs && cc < ce)
324          {
325          d = 0;
326          had_recurse = TRUE;
327          }
328        else d = find_minlength(cs, startcode, options);
329        cc += 3;
330    
331        /* Handle repeated back references */
332    
333        switch (*cc)
334          {
335          case OP_CRSTAR:
336          case OP_CRMINSTAR:
337          case OP_CRQUERY:
338          case OP_CRMINQUERY:
339          min = 0;
340          cc++;
341          break;
342    
343          case OP_CRRANGE:
344          case OP_CRMINRANGE:
345          min = GET2(cc, 1);
346          cc += 5;
347          break;
348    
349          default:
350          min = 1;
351          break;
352          }
353    
354        branchlength += min * d;
355        break;
356    
357        case OP_RECURSE:
358        cs = ce = (uschar *)startcode + GET(cc, 1);
359        if (cs == NULL) return -2;
360        do ce += GET(ce, 1); while (*ce == OP_ALT);
361        if (cc > cs && cc < ce)
362          had_recurse = TRUE;
363        else
364          branchlength += find_minlength(cs, startcode, options);
365        cc += 1 + LINK_SIZE;
366        break;
367    
368        /* Anything else does not or need not match a character. We can get the
369        item's length from the table, but for those that can match zero occurrences
370        of a character, we must take special action for UTF-8 characters. */
371    
372        case OP_UPTO:
373        case OP_NOTUPTO:
374        case OP_MINUPTO:
375        case OP_NOTMINUPTO:
376        case OP_POSUPTO:
377        case OP_STAR:
378        case OP_MINSTAR:
379        case OP_NOTMINSTAR:
380        case OP_POSSTAR:
381        case OP_NOTPOSSTAR:
382        case OP_QUERY:
383        case OP_MINQUERY:
384        case OP_NOTMINQUERY:
385        case OP_POSQUERY:
386        case OP_NOTPOSQUERY:
387        cc += _pcre_OP_lengths[op];
388    #ifdef SUPPORT_UTF8
389        if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
390    #endif
391        break;
392    
393        /* For the record, these are the opcodes that are matched by "default":
394        OP_ACCEPT, OP_CLOSE, OP_COMMIT, OP_FAIL, OP_PRUNE, OP_SET_SOM, OP_SKIP,
395        OP_THEN. */
396    
397        default:
398        cc += _pcre_OP_lengths[op];
399        break;
400        }
401      }
402    /* Control never gets here */
403    }
404    
405    
406    
407  /*************************************************  /*************************************************
408  *      Set a bit and maybe its alternate case    *  *      Set a bit and maybe its alternate case    *
409  *************************************************/  *************************************************/
# Line 213  do Line 567  do
567        tcode += 1 + LINK_SIZE;        tcode += 1 + LINK_SIZE;
568        break;        break;
569    
570          /* SKIPZERO skips the bracket. */
571    
572          case OP_SKIPZERO:
573          tcode++;
574          do tcode += GET(tcode,1); while (*tcode == OP_ALT);
575          tcode += 1 + LINK_SIZE;
576          break;
577    
578        /* Single-char * or ? sets the bit and tries the next item */        /* Single-char * or ? sets the bit and tries the next item */
579    
580        case OP_STAR:        case OP_STAR:
# Line 337  do Line 699  do
699        switch(tcode[1])        switch(tcode[1])
700          {          {
701          case OP_ANY:          case OP_ANY:
702            case OP_ALLANY:
703          return SSB_FAIL;          return SSB_FAIL;
704    
705          case OP_NOT_DIGIT:          case OP_NOT_DIGIT:
# Line 394  do Line 757  do
757        character with a value > 255. */        character with a value > 255. */
758    
759        case OP_NCLASS:        case OP_NCLASS:
760  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
761        if (utf8)        if (utf8)
762          {          {
763          start_bits[24] |= 0xf0;              /* Bits for 0xc4 - 0xc8 */          start_bits[24] |= 0xf0;              /* Bits for 0xc4 - 0xc8 */
764          memset(start_bits+25, 0xff, 7);      /* Bits for 0xc9 - 0xff */          memset(start_bits+25, 0xff, 7);      /* Bits for 0xc9 - 0xff */
765          }          }
766  #endif  #endif
767        /* Fall through */        /* Fall through */
768    
769        case OP_CLASS:        case OP_CLASS:
# Line 431  do Line 794  do
794          /* In non-UTF-8 mode, the two bit maps are completely compatible. */          /* In non-UTF-8 mode, the two bit maps are completely compatible. */
795    
796          else          else
797  #endif  #endif
798            {            {
799            for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];            for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
800            }            }
# Line 487  Arguments: Line 850  Arguments:
850              set NULL unless error              set NULL unless error
851    
852  Returns:    pointer to a pcre_extra block, with study_data filled in and the  Returns:    pointer to a pcre_extra block, with study_data filled in and the
853                appropriate flag set;                appropriate flags set;
854              NULL on error or if no optimization possible              NULL on error or if no optimization possible
855  */  */
856    
857  PCRE_DATA_SCOPE pcre_extra *  PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION
858  pcre_study(const pcre *external_re, int options, const char **errorptr)  pcre_study(const pcre *external_re, int options, const char **errorptr)
859  {  {
860    int min;
861    BOOL bits_set = FALSE;
862  uschar start_bits[32];  uschar start_bits[32];
863  pcre_extra *extra;  pcre_extra *extra;
864  pcre_study_data *study;  pcre_study_data *study;
# Line 520  code = (uschar *)re + re->name_table_off Line 885  code = (uschar *)re + re->name_table_off
885    (re->name_count * re->name_entry_size);    (re->name_count * re->name_entry_size);
886    
887  /* For an anchored pattern, or an unanchored pattern that has a first char, or  /* For an anchored pattern, or an unanchored pattern that has a first char, or
888  a multiline pattern that matches only at "line starts", no further processing  a multiline pattern that matches only at "line starts", there is no point in
889  at present. */  seeking a list of starting bytes. */
890    
891  if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)  if ((re->options & PCRE_ANCHORED) == 0 &&
892    return NULL;      (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) == 0)
893      {
894      /* Set the character tables in the block that is passed around */
895    
896  /* Set the character tables in the block that is passed around */    tables = re->tables;
897      if (tables == NULL)
898        (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
899        (void *)(&tables));
900    
901      compile_block.lcc = tables + lcc_offset;
902      compile_block.fcc = tables + fcc_offset;
903      compile_block.cbits = tables + cbits_offset;
904      compile_block.ctypes = tables + ctypes_offset;
905    
906      /* See if we can find a fixed set of initial characters for the pattern. */
907    
908      memset(start_bits, 0, 32 * sizeof(uschar));
909      bits_set = set_start_bits(code, start_bits,
910        (re->options & PCRE_CASELESS) != 0, (re->options & PCRE_UTF8) != 0,
911        &compile_block) == SSB_DONE;
912      }
913    
914    /* Find the minimum length of subject string. */
915    
916    min = find_minlength(code, code, re->options);
917    
918  tables = re->tables;  /* Return NULL if no optimization is possible. */
919  if (tables == NULL)  
920    (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,  if (!bits_set && min < 0) return NULL;
   (void *)(&tables));  
   
 compile_block.lcc = tables + lcc_offset;  
 compile_block.fcc = tables + fcc_offset;  
 compile_block.cbits = tables + cbits_offset;  
 compile_block.ctypes = tables + ctypes_offset;  
   
 /* See if we can find a fixed set of initial characters for the pattern. */  
   
 memset(start_bits, 0, 32 * sizeof(uschar));  
 if (set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0,  
   (re->options & PCRE_UTF8) != 0, &compile_block) != SSB_DONE) return NULL;  
921    
922  /* Get a pcre_extra block and a pcre_study_data block. The study data is put in  /* Get a pcre_extra block and a pcre_study_data block. The study data is put in
923  the latter, which is pointed to by the former, which may also get additional  the latter, which is pointed to by the former, which may also get additional
# Line 565  extra->flags = PCRE_EXTRA_STUDY_DATA; Line 940  extra->flags = PCRE_EXTRA_STUDY_DATA;
940  extra->study_data = study;  extra->study_data = study;
941    
942  study->size = sizeof(pcre_study_data);  study->size = sizeof(pcre_study_data);
943  study->options = PCRE_STUDY_MAPPED;  study->flags = 0;
944  memcpy(study->start_bits, start_bits, sizeof(start_bits));  
945    if (bits_set)
946      {
947      study->flags |= PCRE_STUDY_MAPPED;
948      memcpy(study->start_bits, start_bits, sizeof(start_bits));
949      }
950    
951    if (min >= 0)
952      {
953      study->flags |= PCRE_STUDY_MINLEN;
954      study->minlength = min;
955      }
956    
957  return extra;  return extra;
958  }  }

Legend:
Removed from v.107  
changed lines
  Added in v.461

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12