/[pcre]/code/trunk/pcre_study.c
ViewVC logotype

Diff of /code/trunk/pcre_study.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 454 by ph10, Wed Jul 9 16:20:19 2008 UTC revision 455 by ph10, Sat Sep 26 19:12:32 2009 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2008 University of Cambridge             Copyright (c) 1997-2009 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 54  supporting functions. */ Line 54  supporting functions. */
54  enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE };  enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE };
55    
56    
57    
58    /*************************************************
59    *   Find the minimum subject length for a group  *
60    *************************************************/
61    
62    /* Scan a parenthesized group and compute the minimum length of subject that
63    is needed to match it. This is a lower bound; it does not mean there is a
64    string of that length that matches. In UTF8 mode, the result is in characters
65    rather than bytes.
66    
67    Arguments:
68      code       pointer to start of group (the bracket)
69      startcode  pointer to start of the whole pattern
70      options    the compiling options
71    
72    Returns:   the minimum length
73               -1 if \C was encountered
74               -2 internal error (missing capturing bracket)
75    */
76    
77    static int
78    find_minlength(const uschar *code, const uschar *startcode, int options)
79    {
80    int length = -1;
81    BOOL utf8 = (options & PCRE_UTF8) != 0;
82    BOOL had_recurse = FALSE;
83    register int branchlength = 0;
84    register uschar *cc = (uschar *)code + 1 + LINK_SIZE;
85    
86    if (*code == OP_CBRA || *code == OP_SCBRA) cc += 2;
87    
88    /* Scan along the opcodes for this branch. If we get to the end of the
89    branch, check the length against that of the other branches. */
90    
91    for (;;)
92      {
93      int d, min;
94      uschar *cs, *ce;
95      register int op = *cc;
96    
97      switch (op)
98        {
99        case OP_CBRA:
100        case OP_SCBRA:
101        case OP_BRA:
102        case OP_SBRA:
103        case OP_ONCE:
104        case OP_COND:
105        case OP_SCOND:
106        d = find_minlength(cc, startcode, options);
107        if (d < 0) return d;
108        branchlength += d;
109        do cc += GET(cc, 1); while (*cc == OP_ALT);
110        cc += 1 + LINK_SIZE;
111        break;
112    
113        /* Reached end of a branch; if it's a ket it is the end of a nested
114        call. If it's ALT it is an alternation in a nested call. If it is
115        END it's the end of the outer call. All can be handled by the same code. */
116    
117        case OP_ALT:
118        case OP_KET:
119        case OP_KETRMAX:
120        case OP_KETRMIN:
121        case OP_END:
122        if (length < 0 || (!had_recurse && branchlength < length))
123          length = branchlength;
124        if (*cc != OP_ALT) return length;
125        cc += 1 + LINK_SIZE;
126        branchlength = 0;
127        had_recurse = FALSE;
128        break;
129    
130        /* Skip over assertive subpatterns */
131    
132        case OP_ASSERT:
133        case OP_ASSERT_NOT:
134        case OP_ASSERTBACK:
135        case OP_ASSERTBACK_NOT:
136        do cc += GET(cc, 1); while (*cc == OP_ALT);
137        /* Fall through */
138    
139        /* Skip over things that don't match chars */
140    
141        case OP_REVERSE:
142        case OP_CREF:
143        case OP_RREF:
144        case OP_DEF:
145        case OP_OPT:
146        case OP_CALLOUT:
147        case OP_SOD:
148        case OP_SOM:
149        case OP_EOD:
150        case OP_EODN:
151        case OP_CIRC:
152        case OP_DOLL:
153        case OP_NOT_WORD_BOUNDARY:
154        case OP_WORD_BOUNDARY:
155        cc += _pcre_OP_lengths[*cc];
156        break;
157    
158        /* Skip over a subpattern that has a {0} or {0,x} quantifier */
159    
160        case OP_BRAZERO:
161        case OP_BRAMINZERO:
162        case OP_SKIPZERO:
163        cc += _pcre_OP_lengths[*cc];
164        do cc += GET(cc, 1); while (*cc == OP_ALT);
165        cc += 1 + LINK_SIZE;
166        break;
167    
168        /* Handle literal characters and + repetitions */
169    
170        case OP_CHAR:
171        case OP_CHARNC:
172        case OP_NOT:
173        case OP_PLUS:
174        case OP_MINPLUS:
175        case OP_POSPLUS:
176        case OP_NOTPLUS:
177        case OP_NOTMINPLUS:
178        case OP_NOTPOSPLUS:
179        branchlength++;
180        cc += 2;
181    #ifdef SUPPORT_UTF8
182        if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
183    #endif
184        break;
185    
186        case OP_TYPEPLUS:
187        case OP_TYPEMINPLUS:
188        case OP_TYPEPOSPLUS:
189        branchlength++;
190        cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
191        break;
192    
193        /* Handle exact repetitions. The count is already in characters, but we
194        need to skip over a multibyte character in UTF8 mode.  */
195    
196        case OP_EXACT:
197        case OP_NOTEXACT:
198        branchlength += GET2(cc,1);
199        cc += 4;
200    #ifdef SUPPORT_UTF8
201        if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
202    #endif
203        break;
204    
205        case OP_TYPEEXACT:
206        branchlength += GET2(cc,1);
207        cc += (cc[3] == OP_PROP || cc[3] == OP_NOTPROP)? 6 : 4;
208        break;
209    
210        /* Handle single-char non-literal matchers */
211    
212        case OP_PROP:
213        case OP_NOTPROP:
214        cc += 2;
215        /* Fall through */
216    
217        case OP_NOT_DIGIT:
218        case OP_DIGIT:
219        case OP_NOT_WHITESPACE:
220        case OP_WHITESPACE:
221        case OP_NOT_WORDCHAR:
222        case OP_WORDCHAR:
223        case OP_ANY:
224        case OP_ALLANY:
225        case OP_EXTUNI:
226        case OP_HSPACE:
227        case OP_NOT_HSPACE:
228        case OP_VSPACE:
229        case OP_NOT_VSPACE:
230        branchlength++;
231        cc++;
232        break;
233    
234        /* "Any newline" might match two characters */
235    
236        case OP_ANYNL:
237        branchlength += 2;
238        cc++;
239        break;
240    
241        /* The single-byte matcher means we can't proceed in UTF-8 mode */
242    
243        case OP_ANYBYTE:
244    #ifdef SUPPORT_UTF8
245        if (utf8) return -1;
246    #endif
247        branchlength++;
248        cc++;
249        break;
250    
251        /* For repeated character types, we have to test for \p and \P, which have
252        an extra two bytes of parameters. */
253    
254        case OP_TYPESTAR:
255        case OP_TYPEMINSTAR:
256        case OP_TYPEQUERY:
257        case OP_TYPEMINQUERY:
258        case OP_TYPEPOSSTAR:
259        case OP_TYPEPOSQUERY:
260        if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
261        cc += _pcre_OP_lengths[op];
262        break;
263    
264        case OP_TYPEUPTO:
265        case OP_TYPEMINUPTO:
266        case OP_TYPEPOSUPTO:
267        if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
268        cc += _pcre_OP_lengths[op];
269        break;
270    
271        /* Check a class for variable quantification */
272    
273    #ifdef SUPPORT_UTF8
274        case OP_XCLASS:
275        cc += GET(cc, 1) - 33;
276        /* Fall through */
277    #endif
278    
279        case OP_CLASS:
280        case OP_NCLASS:
281        cc += 33;
282    
283        switch (*cc)
284          {
285          case OP_CRPLUS:
286          case OP_CRMINPLUS:
287          branchlength++;
288          /* Fall through */
289    
290          case OP_CRSTAR:
291          case OP_CRMINSTAR:
292          case OP_CRQUERY:
293          case OP_CRMINQUERY:
294          cc++;
295          break;
296    
297          case OP_CRRANGE:
298          case OP_CRMINRANGE:
299          branchlength += GET2(cc,1);
300          cc += 5;
301          break;
302    
303          default:
304          branchlength++;
305          break;
306          }
307        break;
308    
309        /* Backreferences and subroutine calls are treated in the same way: we find
310        the minimum length for the subpattern. A recursion, however, causes an
311        a flag to be set that causes the length of this branch to be ignored. The
312        logic is that a recursion can only make sense if there is another
313        alternation that stops the recursing. That will provide the minimum length
314        (when no recursion happens). A backreference within the group that it is
315        referencing behaves in the same way. */
316    
317        case OP_REF:
318        ce = cs = (uschar *)_pcre_find_bracket(startcode, utf8, GET2(cc, 1));
319        if (cs == NULL) return -2;
320        do ce += GET(ce, 1); while (*ce == OP_ALT);
321        if (cc > cs && cc < ce)
322          {
323          d = 0;
324          had_recurse = TRUE;
325          }
326        else d = find_minlength(cs, startcode, options);
327        cc += 3;
328    
329        /* Handle repeated back references */
330    
331        switch (*cc)
332          {
333          case OP_CRSTAR:
334          case OP_CRMINSTAR:
335          case OP_CRQUERY:
336          case OP_CRMINQUERY:
337          min = 0;
338          cc++;
339          break;
340    
341          case OP_CRRANGE:
342          case OP_CRMINRANGE:
343          min = GET2(cc, 1);
344          cc += 5;
345          break;
346    
347          default:
348          min = 1;
349          break;
350          }
351    
352        branchlength += min * d;
353        break;
354    
355        case OP_RECURSE:
356        cs = ce = (uschar *)startcode + GET(cc, 1);
357        if (cs == NULL) return -2;
358        do ce += GET(ce, 1); while (*ce == OP_ALT);
359        if (cc > cs && cc < ce)
360          had_recurse = TRUE;
361        else
362          branchlength += find_minlength(cs, startcode, options);
363        cc += 1 + LINK_SIZE;
364        break;
365    
366        /* Anything else does not or need not match a character. We can get the
367        item's length from the table, but for those that can match zero occurrences
368        of a character, we must take special action for UTF-8 characters. */
369    
370        case OP_UPTO:
371        case OP_NOTUPTO:
372        case OP_MINUPTO:
373        case OP_NOTMINUPTO:
374        case OP_POSUPTO:
375        case OP_STAR:
376        case OP_MINSTAR:
377        case OP_NOTMINSTAR:
378        case OP_POSSTAR:
379        case OP_NOTPOSSTAR:
380        case OP_QUERY:
381        case OP_MINQUERY:
382        case OP_NOTMINQUERY:
383        case OP_POSQUERY:
384        case OP_NOTPOSQUERY:
385        cc += _pcre_OP_lengths[op];
386    #ifdef SUPPORT_UTF8
387        if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
388    #endif
389        break;
390    
391        /* For the record, these are the opcodes that are matched by "default":
392        OP_ACCEPT, OP_CLOSE, OP_COMMIT, OP_FAIL, OP_PRUNE, OP_SET_SOM, OP_SKIP,
393        OP_THEN. */
394    
395        default:
396        cc += _pcre_OP_lengths[op];
397        break;
398        }
399      }
400    /* Control never gets here */
401    }
402    
403    
404    
405  /*************************************************  /*************************************************
406  *      Set a bit and maybe its alternate case    *  *      Set a bit and maybe its alternate case    *
407  *************************************************/  *************************************************/
# Line 500  Arguments: Line 848  Arguments:
848              set NULL unless error              set NULL unless error
849    
850  Returns:    pointer to a pcre_extra block, with study_data filled in and the  Returns:    pointer to a pcre_extra block, with study_data filled in and the
851                appropriate flag set;                appropriate flags set;
852              NULL on error or if no optimization possible              NULL on error or if no optimization possible
853  */  */
854    
855  PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION  PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION
856  pcre_study(const pcre *external_re, int options, const char **errorptr)  pcre_study(const pcre *external_re, int options, const char **errorptr)
857  {  {
858    int min;
859    BOOL bits_set = FALSE;
860  uschar start_bits[32];  uschar start_bits[32];
861  pcre_extra *extra;  pcre_extra *extra;
862  pcre_study_data *study;  pcre_study_data *study;
# Line 533  code = (uschar *)re + re->name_table_off Line 883  code = (uschar *)re + re->name_table_off
883    (re->name_count * re->name_entry_size);    (re->name_count * re->name_entry_size);
884    
885  /* For an anchored pattern, or an unanchored pattern that has a first char, or  /* For an anchored pattern, or an unanchored pattern that has a first char, or
886  a multiline pattern that matches only at "line starts", no further processing  a multiline pattern that matches only at "line starts", there is no point in
887  at present. */  seeking a list of starting bytes. */
888    
889  if ((re->options & PCRE_ANCHORED) != 0 ||  if ((re->options & PCRE_ANCHORED) == 0 &&
890      (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)      (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) == 0)
891    return NULL;    {
892      /* Set the character tables in the block that is passed around */
893    
894      tables = re->tables;
895      if (tables == NULL)
896        (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
897        (void *)(&tables));
898    
899      compile_block.lcc = tables + lcc_offset;
900      compile_block.fcc = tables + fcc_offset;
901      compile_block.cbits = tables + cbits_offset;
902      compile_block.ctypes = tables + ctypes_offset;
903    
904      /* See if we can find a fixed set of initial characters for the pattern. */
905    
906      memset(start_bits, 0, 32 * sizeof(uschar));
907      bits_set = set_start_bits(code, start_bits,
908        (re->options & PCRE_CASELESS) != 0, (re->options & PCRE_UTF8) != 0,
909        &compile_block) == SSB_DONE;
910      }
911    
912    /* Find the minimum length of subject string. */
913    
914    min = find_minlength(code, code, re->options);
915    
916  /* Set the character tables in the block that is passed around */  /* Return NULL if no optimization is possible. */
917    
918  tables = re->tables;  if (!bits_set && min < 0) return NULL;
 if (tables == NULL)  
   (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,  
   (void *)(&tables));  
   
 compile_block.lcc = tables + lcc_offset;  
 compile_block.fcc = tables + fcc_offset;  
 compile_block.cbits = tables + cbits_offset;  
 compile_block.ctypes = tables + ctypes_offset;  
   
 /* See if we can find a fixed set of initial characters for the pattern. */  
   
 memset(start_bits, 0, 32 * sizeof(uschar));  
 if (set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0,  
   (re->options & PCRE_UTF8) != 0, &compile_block) != SSB_DONE) return NULL;  
919    
920  /* Get a pcre_extra block and a pcre_study_data block. The study data is put in  /* Get a pcre_extra block and a pcre_study_data block. The study data is put in
921  the latter, which is pointed to by the former, which may also get additional  the latter, which is pointed to by the former, which may also get additional
# Line 579  extra->flags = PCRE_EXTRA_STUDY_DATA; Line 938  extra->flags = PCRE_EXTRA_STUDY_DATA;
938  extra->study_data = study;  extra->study_data = study;
939    
940  study->size = sizeof(pcre_study_data);  study->size = sizeof(pcre_study_data);
941  study->options = PCRE_STUDY_MAPPED;  study->flags = 0;
942  memcpy(study->start_bits, start_bits, sizeof(start_bits));  
943    if (bits_set)
944      {
945      study->flags |= PCRE_STUDY_MAPPED;
946      memcpy(study->start_bits, start_bits, sizeof(start_bits));
947      }
948    
949    if (min >= 0)
950      {
951      study->flags |= PCRE_STUDY_MINLEN;
952      study->minlength = min;
953      }
954    
955  return extra;  return extra;
956  }  }

Legend:
Removed from v.454  
changed lines
  Added in v.455

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12