/[pcre]/code/trunk/pcre_study.c
ViewVC logotype

Diff of /code/trunk/pcre_study.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 335 by ph10, Sat Apr 12 14:36:14 2008 UTC revision 475 by ph10, Sat Jan 2 18:21:30 2010 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2008 University of Cambridge             Copyright (c) 1997-2010 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 54  supporting functions. */ Line 54  supporting functions. */
54  enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE };  enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE };
55    
56    
57    
58    /*************************************************
59    *   Find the minimum subject length for a group  *
60    *************************************************/
61    
62    /* Scan a parenthesized group and compute the minimum length of subject that
63    is needed to match it. This is a lower bound; it does not mean there is a
64    string of that length that matches. In UTF8 mode, the result is in characters
65    rather than bytes.
66    
67    Arguments:
68      code       pointer to start of group (the bracket)
69      startcode  pointer to start of the whole pattern
70      options    the compiling options
71    
72    Returns:   the minimum length
73               -1 if \C was encountered
74               -2 internal error (missing capturing bracket)
75    */
76    
77    static int
78    find_minlength(const uschar *code, const uschar *startcode, int options)
79    {
80    int length = -1;
81    BOOL utf8 = (options & PCRE_UTF8) != 0;
82    BOOL had_recurse = FALSE;
83    register int branchlength = 0;
84    register uschar *cc = (uschar *)code + 1 + LINK_SIZE;
85    
86    if (*code == OP_CBRA || *code == OP_SCBRA) cc += 2;
87    
88    /* Scan along the opcodes for this branch. If we get to the end of the
89    branch, check the length against that of the other branches. */
90    
91    for (;;)
92      {
93      int d, min;
94      uschar *cs, *ce;
95      register int op = *cc;
96    
97      switch (op)
98        {
99        case OP_COND:
100        case OP_SCOND:
101    
102        /* If there is only one branch in a condition, the implied branch has zero
103        length, so we don't add anything. This covers the DEFINE "condition"
104        automatically. */
105    
106        cs = cc + GET(cc, 1);
107        if (*cs != OP_ALT)
108          {
109          cc = cs + 1 + LINK_SIZE;
110          break;
111          }
112    
113        /* Otherwise we can fall through and treat it the same as any other
114        subpattern. */
115    
116        case OP_CBRA:
117        case OP_SCBRA:
118        case OP_BRA:
119        case OP_SBRA:
120        case OP_ONCE:
121        d = find_minlength(cc, startcode, options);
122        if (d < 0) return d;
123        branchlength += d;
124        do cc += GET(cc, 1); while (*cc == OP_ALT);
125        cc += 1 + LINK_SIZE;
126        break;
127    
128        /* Reached end of a branch; if it's a ket it is the end of a nested
129        call. If it's ALT it is an alternation in a nested call. If it is
130        END it's the end of the outer call. All can be handled by the same code. */
131    
132        case OP_ALT:
133        case OP_KET:
134        case OP_KETRMAX:
135        case OP_KETRMIN:
136        case OP_END:
137        if (length < 0 || (!had_recurse && branchlength < length))
138          length = branchlength;
139        if (*cc != OP_ALT) return length;
140        cc += 1 + LINK_SIZE;
141        branchlength = 0;
142        had_recurse = FALSE;
143        break;
144    
145        /* Skip over assertive subpatterns */
146    
147        case OP_ASSERT:
148        case OP_ASSERT_NOT:
149        case OP_ASSERTBACK:
150        case OP_ASSERTBACK_NOT:
151        do cc += GET(cc, 1); while (*cc == OP_ALT);
152        /* Fall through */
153    
154        /* Skip over things that don't match chars */
155    
156        case OP_REVERSE:
157        case OP_CREF:
158        case OP_NCREF:
159        case OP_RREF:
160        case OP_NRREF:
161        case OP_DEF:
162        case OP_OPT:
163        case OP_CALLOUT:
164        case OP_SOD:
165        case OP_SOM:
166        case OP_EOD:
167        case OP_EODN:
168        case OP_CIRC:
169        case OP_DOLL:
170        case OP_NOT_WORD_BOUNDARY:
171        case OP_WORD_BOUNDARY:
172        cc += _pcre_OP_lengths[*cc];
173        break;
174    
175        /* Skip over a subpattern that has a {0} or {0,x} quantifier */
176    
177        case OP_BRAZERO:
178        case OP_BRAMINZERO:
179        case OP_SKIPZERO:
180        cc += _pcre_OP_lengths[*cc];
181        do cc += GET(cc, 1); while (*cc == OP_ALT);
182        cc += 1 + LINK_SIZE;
183        break;
184    
185        /* Handle literal characters and + repetitions */
186    
187        case OP_CHAR:
188        case OP_CHARNC:
189        case OP_NOT:
190        case OP_PLUS:
191        case OP_MINPLUS:
192        case OP_POSPLUS:
193        case OP_NOTPLUS:
194        case OP_NOTMINPLUS:
195        case OP_NOTPOSPLUS:
196        branchlength++;
197        cc += 2;
198    #ifdef SUPPORT_UTF8
199        if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
200    #endif
201        break;
202    
203        case OP_TYPEPLUS:
204        case OP_TYPEMINPLUS:
205        case OP_TYPEPOSPLUS:
206        branchlength++;
207        cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
208        break;
209    
210        /* Handle exact repetitions. The count is already in characters, but we
211        need to skip over a multibyte character in UTF8 mode.  */
212    
213        case OP_EXACT:
214        case OP_NOTEXACT:
215        branchlength += GET2(cc,1);
216        cc += 4;
217    #ifdef SUPPORT_UTF8
218        if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
219    #endif
220        break;
221    
222        case OP_TYPEEXACT:
223        branchlength += GET2(cc,1);
224        cc += (cc[3] == OP_PROP || cc[3] == OP_NOTPROP)? 6 : 4;
225        break;
226    
227        /* Handle single-char non-literal matchers */
228    
229        case OP_PROP:
230        case OP_NOTPROP:
231        cc += 2;
232        /* Fall through */
233    
234        case OP_NOT_DIGIT:
235        case OP_DIGIT:
236        case OP_NOT_WHITESPACE:
237        case OP_WHITESPACE:
238        case OP_NOT_WORDCHAR:
239        case OP_WORDCHAR:
240        case OP_ANY:
241        case OP_ALLANY:
242        case OP_EXTUNI:
243        case OP_HSPACE:
244        case OP_NOT_HSPACE:
245        case OP_VSPACE:
246        case OP_NOT_VSPACE:
247        branchlength++;
248        cc++;
249        break;
250    
251        /* "Any newline" might match two characters */
252    
253        case OP_ANYNL:
254        branchlength += 2;
255        cc++;
256        break;
257    
258        /* The single-byte matcher means we can't proceed in UTF-8 mode */
259    
260        case OP_ANYBYTE:
261    #ifdef SUPPORT_UTF8
262        if (utf8) return -1;
263    #endif
264        branchlength++;
265        cc++;
266        break;
267    
268        /* For repeated character types, we have to test for \p and \P, which have
269        an extra two bytes of parameters. */
270    
271        case OP_TYPESTAR:
272        case OP_TYPEMINSTAR:
273        case OP_TYPEQUERY:
274        case OP_TYPEMINQUERY:
275        case OP_TYPEPOSSTAR:
276        case OP_TYPEPOSQUERY:
277        if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
278        cc += _pcre_OP_lengths[op];
279        break;
280    
281        case OP_TYPEUPTO:
282        case OP_TYPEMINUPTO:
283        case OP_TYPEPOSUPTO:
284        if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
285        cc += _pcre_OP_lengths[op];
286        break;
287    
288        /* Check a class for variable quantification */
289    
290    #ifdef SUPPORT_UTF8
291        case OP_XCLASS:
292        cc += GET(cc, 1) - 33;
293        /* Fall through */
294    #endif
295    
296        case OP_CLASS:
297        case OP_NCLASS:
298        cc += 33;
299    
300        switch (*cc)
301          {
302          case OP_CRPLUS:
303          case OP_CRMINPLUS:
304          branchlength++;
305          /* Fall through */
306    
307          case OP_CRSTAR:
308          case OP_CRMINSTAR:
309          case OP_CRQUERY:
310          case OP_CRMINQUERY:
311          cc++;
312          break;
313    
314          case OP_CRRANGE:
315          case OP_CRMINRANGE:
316          branchlength += GET2(cc,1);
317          cc += 5;
318          break;
319    
320          default:
321          branchlength++;
322          break;
323          }
324        break;
325    
326        /* Backreferences and subroutine calls are treated in the same way: we find
327        the minimum length for the subpattern. A recursion, however, causes an
328        a flag to be set that causes the length of this branch to be ignored. The
329        logic is that a recursion can only make sense if there is another
330        alternation that stops the recursing. That will provide the minimum length
331        (when no recursion happens). A backreference within the group that it is
332        referencing behaves in the same way.
333    
334        If PCRE_JAVASCRIPT_COMPAT is set, a backreference to an unset bracket
335        matches an empty string (by default it causes a matching failure), so in
336        that case we must set the minimum length to zero. */
337    
338        case OP_REF:
339        if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
340          {
341          ce = cs = (uschar *)_pcre_find_bracket(startcode, utf8, GET2(cc, 1));
342          if (cs == NULL) return -2;
343          do ce += GET(ce, 1); while (*ce == OP_ALT);
344          if (cc > cs && cc < ce)
345            {
346            d = 0;
347            had_recurse = TRUE;
348            }
349          else d = find_minlength(cs, startcode, options);
350          }
351        else d = 0;
352        cc += 3;
353    
354        /* Handle repeated back references */
355    
356        switch (*cc)
357          {
358          case OP_CRSTAR:
359          case OP_CRMINSTAR:
360          case OP_CRQUERY:
361          case OP_CRMINQUERY:
362          min = 0;
363          cc++;
364          break;
365    
366          case OP_CRRANGE:
367          case OP_CRMINRANGE:
368          min = GET2(cc, 1);
369          cc += 5;
370          break;
371    
372          default:
373          min = 1;
374          break;
375          }
376    
377        branchlength += min * d;
378        break;
379    
380        case OP_RECURSE:
381        cs = ce = (uschar *)startcode + GET(cc, 1);
382        if (cs == NULL) return -2;
383        do ce += GET(ce, 1); while (*ce == OP_ALT);
384        if (cc > cs && cc < ce)
385          had_recurse = TRUE;
386        else
387          branchlength += find_minlength(cs, startcode, options);
388        cc += 1 + LINK_SIZE;
389        break;
390    
391        /* Anything else does not or need not match a character. We can get the
392        item's length from the table, but for those that can match zero occurrences
393        of a character, we must take special action for UTF-8 characters. */
394    
395        case OP_UPTO:
396        case OP_NOTUPTO:
397        case OP_MINUPTO:
398        case OP_NOTMINUPTO:
399        case OP_POSUPTO:
400        case OP_STAR:
401        case OP_MINSTAR:
402        case OP_NOTMINSTAR:
403        case OP_POSSTAR:
404        case OP_NOTPOSSTAR:
405        case OP_QUERY:
406        case OP_MINQUERY:
407        case OP_NOTMINQUERY:
408        case OP_POSQUERY:
409        case OP_NOTPOSQUERY:
410        cc += _pcre_OP_lengths[op];
411    #ifdef SUPPORT_UTF8
412        if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
413    #endif
414        break;
415    
416        /* For the record, these are the opcodes that are matched by "default":
417        OP_ACCEPT, OP_CLOSE, OP_COMMIT, OP_FAIL, OP_PRUNE, OP_SET_SOM, OP_SKIP,
418        OP_THEN. */
419    
420        default:
421        cc += _pcre_OP_lengths[op];
422        break;
423        }
424      }
425    /* Control never gets here */
426    }
427    
428    
429    
430  /*************************************************  /*************************************************
431  *      Set a bit and maybe its alternate case    *  *      Set a bit and maybe its alternate case    *
432  *************************************************/  *************************************************/
# Line 71  Returns: nothing Line 444  Returns: nothing
444  */  */
445    
446  static void  static void
447  set_bit(uschar *start_bits, unsigned int c, BOOL caseless, compile_data *cd)  set_table_bit(uschar *start_bits, unsigned int c, BOOL caseless,
448      compile_data *cd)
449  {  {
450  start_bits[c/8] |= (1 << (c&7));  start_bits[c/8] |= (1 << (c&7));
451  if (caseless && (cd->ctypes[c] & ctype_letter) != 0)  if (caseless && (cd->ctypes[c] & ctype_letter) != 0)
# Line 220  do Line 594  do
594        /* SKIPZERO skips the bracket. */        /* SKIPZERO skips the bracket. */
595    
596        case OP_SKIPZERO:        case OP_SKIPZERO:
597          tcode++;
598        do tcode += GET(tcode,1); while (*tcode == OP_ALT);        do tcode += GET(tcode,1); while (*tcode == OP_ALT);
599        tcode += 1 + LINK_SIZE;        tcode += 1 + LINK_SIZE;
600        break;        break;
# Line 232  do Line 607  do
607        case OP_QUERY:        case OP_QUERY:
608        case OP_MINQUERY:        case OP_MINQUERY:
609        case OP_POSQUERY:        case OP_POSQUERY:
610        set_bit(start_bits, tcode[1], caseless, cd);        set_table_bit(start_bits, tcode[1], caseless, cd);
611        tcode += 2;        tcode += 2;
612  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
613        if (utf8 && tcode[-1] >= 0xc0)        if (utf8 && tcode[-1] >= 0xc0)
# Line 245  do Line 620  do
620        case OP_UPTO:        case OP_UPTO:
621        case OP_MINUPTO:        case OP_MINUPTO:
622        case OP_POSUPTO:        case OP_POSUPTO:
623        set_bit(start_bits, tcode[3], caseless, cd);        set_table_bit(start_bits, tcode[3], caseless, cd);
624        tcode += 4;        tcode += 4;
625  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
626        if (utf8 && tcode[-1] >= 0xc0)        if (utf8 && tcode[-1] >= 0xc0)
# Line 263  do Line 638  do
638        case OP_PLUS:        case OP_PLUS:
639        case OP_MINPLUS:        case OP_MINPLUS:
640        case OP_POSPLUS:        case OP_POSPLUS:
641        set_bit(start_bits, tcode[1], caseless, cd);        set_table_bit(start_bits, tcode[1], caseless, cd);
642        try_next = FALSE;        try_next = FALSE;
643        break;        break;
644    
# Line 348  do Line 723  do
723        switch(tcode[1])        switch(tcode[1])
724          {          {
725          case OP_ANY:          case OP_ANY:
726            case OP_ALLANY:
727          return SSB_FAIL;          return SSB_FAIL;
728    
729          case OP_NOT_DIGIT:          case OP_NOT_DIGIT:
# Line 498  Arguments: Line 874  Arguments:
874              set NULL unless error              set NULL unless error
875    
876  Returns:    pointer to a pcre_extra block, with study_data filled in and the  Returns:    pointer to a pcre_extra block, with study_data filled in and the
877                appropriate flag set;                appropriate flags set;
878              NULL on error or if no optimization possible              NULL on error or if no optimization possible
879  */  */
880    
881  PCRE_EXP_DEFN pcre_extra *  PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION
882  pcre_study(const pcre *external_re, int options, const char **errorptr)  pcre_study(const pcre *external_re, int options, const char **errorptr)
883  {  {
884    int min;
885    BOOL bits_set = FALSE;
886  uschar start_bits[32];  uschar start_bits[32];
887  pcre_extra *extra;  pcre_extra *extra;
888  pcre_study_data *study;  pcre_study_data *study;
# Line 531  code = (uschar *)re + re->name_table_off Line 909  code = (uschar *)re + re->name_table_off
909    (re->name_count * re->name_entry_size);    (re->name_count * re->name_entry_size);
910    
911  /* For an anchored pattern, or an unanchored pattern that has a first char, or  /* For an anchored pattern, or an unanchored pattern that has a first char, or
912  a multiline pattern that matches only at "line starts", no further processing  a multiline pattern that matches only at "line starts", there is no point in
913  at present. */  seeking a list of starting bytes. */
914    
915  if ((re->options & PCRE_ANCHORED) != 0 ||  if ((re->options & PCRE_ANCHORED) == 0 &&
916      (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)      (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) == 0)
917    return NULL;    {
918      /* Set the character tables in the block that is passed around */
919    
920  /* Set the character tables in the block that is passed around */    tables = re->tables;
921      if (tables == NULL)
922        (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
923        (void *)(&tables));
924    
925      compile_block.lcc = tables + lcc_offset;
926      compile_block.fcc = tables + fcc_offset;
927      compile_block.cbits = tables + cbits_offset;
928      compile_block.ctypes = tables + ctypes_offset;
929    
930      /* See if we can find a fixed set of initial characters for the pattern. */
931    
932      memset(start_bits, 0, 32 * sizeof(uschar));
933      bits_set = set_start_bits(code, start_bits,
934        (re->options & PCRE_CASELESS) != 0, (re->options & PCRE_UTF8) != 0,
935        &compile_block) == SSB_DONE;
936      }
937    
938    /* Find the minimum length of subject string. */
939    
940    min = find_minlength(code, code, re->options);
941    
942  tables = re->tables;  /* Return NULL if no optimization is possible. */
943  if (tables == NULL)  
944    (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,  if (!bits_set && min < 0) return NULL;
   (void *)(&tables));  
   
 compile_block.lcc = tables + lcc_offset;  
 compile_block.fcc = tables + fcc_offset;  
 compile_block.cbits = tables + cbits_offset;  
 compile_block.ctypes = tables + ctypes_offset;  
   
 /* See if we can find a fixed set of initial characters for the pattern. */  
   
 memset(start_bits, 0, 32 * sizeof(uschar));  
 if (set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0,  
   (re->options & PCRE_UTF8) != 0, &compile_block) != SSB_DONE) return NULL;  
945    
946  /* Get a pcre_extra block and a pcre_study_data block. The study data is put in  /* Get a pcre_extra block and a pcre_study_data block. The study data is put in
947  the latter, which is pointed to by the former, which may also get additional  the latter, which is pointed to by the former, which may also get additional
# Line 577  extra->flags = PCRE_EXTRA_STUDY_DATA; Line 964  extra->flags = PCRE_EXTRA_STUDY_DATA;
964  extra->study_data = study;  extra->study_data = study;
965    
966  study->size = sizeof(pcre_study_data);  study->size = sizeof(pcre_study_data);
967  study->options = PCRE_STUDY_MAPPED;  study->flags = 0;
968  memcpy(study->start_bits, start_bits, sizeof(start_bits));  
969    if (bits_set)
970      {
971      study->flags |= PCRE_STUDY_MAPPED;
972      memcpy(study->start_bits, start_bits, sizeof(start_bits));
973      }
974    
975    if (min >= 0)
976      {
977      study->flags |= PCRE_STUDY_MINLEN;
978      study->minlength = min;
979      }
980    
981  return extra;  return extra;
982  }  }

Legend:
Removed from v.335  
changed lines
  Added in v.475

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12