/[pcre]/code/tags/pcre-8.10/pcre_study.c
ViewVC logotype

Diff of /code/tags/pcre-8.10/pcre_study.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 93 by nigel, Sat Feb 24 21:41:42 2007 UTC revision 510 by ph10, Sat Mar 27 17:45:29 2010 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2006 University of Cambridge             Copyright (c) 1997-2010 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 42  POSSIBILITY OF SUCH DAMAGE. Line 42  POSSIBILITY OF SUCH DAMAGE.
42  supporting functions. */  supporting functions. */
43    
44    
45    #ifdef HAVE_CONFIG_H
46    #include "config.h"
47    #endif
48    
49  #include "pcre_internal.h"  #include "pcre_internal.h"
50    
51    
# Line 50  supporting functions. */ Line 54  supporting functions. */
54  enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE };  enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE };
55    
56    
57    
58    /*************************************************
59    *   Find the minimum subject length for a group  *
60    *************************************************/
61    
62    /* Scan a parenthesized group and compute the minimum length of subject that
63    is needed to match it. This is a lower bound; it does not mean there is a
64    string of that length that matches. In UTF8 mode, the result is in characters
65    rather than bytes.
66    
67    Arguments:
68      code       pointer to start of group (the bracket)
69      startcode  pointer to start of the whole pattern
70      options    the compiling options
71    
72    Returns:   the minimum length
73               -1 if \C was encountered
74               -2 internal error (missing capturing bracket)
75    */
76    
77    static int
78    find_minlength(const uschar *code, const uschar *startcode, int options)
79    {
80    int length = -1;
81    BOOL utf8 = (options & PCRE_UTF8) != 0;
82    BOOL had_recurse = FALSE;
83    register int branchlength = 0;
84    register uschar *cc = (uschar *)code + 1 + LINK_SIZE;
85    
86    if (*code == OP_CBRA || *code == OP_SCBRA) cc += 2;
87    
88    /* Scan along the opcodes for this branch. If we get to the end of the
89    branch, check the length against that of the other branches. */
90    
91    for (;;)
92      {
93      int d, min;
94      uschar *cs, *ce;
95      register int op = *cc;
96    
97      switch (op)
98        {
99        case OP_COND:
100        case OP_SCOND:
101    
102        /* If there is only one branch in a condition, the implied branch has zero
103        length, so we don't add anything. This covers the DEFINE "condition"
104        automatically. */
105    
106        cs = cc + GET(cc, 1);
107        if (*cs != OP_ALT)
108          {
109          cc = cs + 1 + LINK_SIZE;
110          break;
111          }
112    
113        /* Otherwise we can fall through and treat it the same as any other
114        subpattern. */
115    
116        case OP_CBRA:
117        case OP_SCBRA:
118        case OP_BRA:
119        case OP_SBRA:
120        case OP_ONCE:
121        d = find_minlength(cc, startcode, options);
122        if (d < 0) return d;
123        branchlength += d;
124        do cc += GET(cc, 1); while (*cc == OP_ALT);
125        cc += 1 + LINK_SIZE;
126        break;
127    
128        /* Reached end of a branch; if it's a ket it is the end of a nested
129        call. If it's ALT it is an alternation in a nested call. If it is
130        END it's the end of the outer call. All can be handled by the same code. */
131    
132        case OP_ALT:
133        case OP_KET:
134        case OP_KETRMAX:
135        case OP_KETRMIN:
136        case OP_END:
137        if (length < 0 || (!had_recurse && branchlength < length))
138          length = branchlength;
139        if (*cc != OP_ALT) return length;
140        cc += 1 + LINK_SIZE;
141        branchlength = 0;
142        had_recurse = FALSE;
143        break;
144    
145        /* Skip over assertive subpatterns */
146    
147        case OP_ASSERT:
148        case OP_ASSERT_NOT:
149        case OP_ASSERTBACK:
150        case OP_ASSERTBACK_NOT:
151        do cc += GET(cc, 1); while (*cc == OP_ALT);
152        /* Fall through */
153    
154        /* Skip over things that don't match chars */
155    
156        case OP_REVERSE:
157        case OP_CREF:
158        case OP_NCREF:
159        case OP_RREF:
160        case OP_NRREF:
161        case OP_DEF:
162        case OP_OPT:
163        case OP_CALLOUT:
164        case OP_SOD:
165        case OP_SOM:
166        case OP_EOD:
167        case OP_EODN:
168        case OP_CIRC:
169        case OP_DOLL:
170        case OP_NOT_WORD_BOUNDARY:
171        case OP_WORD_BOUNDARY:
172        cc += _pcre_OP_lengths[*cc];
173        break;
174    
175        /* Skip over a subpattern that has a {0} or {0,x} quantifier */
176    
177        case OP_BRAZERO:
178        case OP_BRAMINZERO:
179        case OP_SKIPZERO:
180        cc += _pcre_OP_lengths[*cc];
181        do cc += GET(cc, 1); while (*cc == OP_ALT);
182        cc += 1 + LINK_SIZE;
183        break;
184    
185        /* Handle literal characters and + repetitions */
186    
187        case OP_CHAR:
188        case OP_CHARNC:
189        case OP_NOT:
190        case OP_PLUS:
191        case OP_MINPLUS:
192        case OP_POSPLUS:
193        case OP_NOTPLUS:
194        case OP_NOTMINPLUS:
195        case OP_NOTPOSPLUS:
196        branchlength++;
197        cc += 2;
198    #ifdef SUPPORT_UTF8
199        if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
200    #endif
201        break;
202    
203        case OP_TYPEPLUS:
204        case OP_TYPEMINPLUS:
205        case OP_TYPEPOSPLUS:
206        branchlength++;
207        cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
208        break;
209    
210        /* Handle exact repetitions. The count is already in characters, but we
211        need to skip over a multibyte character in UTF8 mode.  */
212    
213        case OP_EXACT:
214        case OP_NOTEXACT:
215        branchlength += GET2(cc,1);
216        cc += 4;
217    #ifdef SUPPORT_UTF8
218        if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
219    #endif
220        break;
221    
222        case OP_TYPEEXACT:
223        branchlength += GET2(cc,1);
224        cc += (cc[3] == OP_PROP || cc[3] == OP_NOTPROP)? 6 : 4;
225        break;
226    
227        /* Handle single-char non-literal matchers */
228    
229        case OP_PROP:
230        case OP_NOTPROP:
231        cc += 2;
232        /* Fall through */
233    
234        case OP_NOT_DIGIT:
235        case OP_DIGIT:
236        case OP_NOT_WHITESPACE:
237        case OP_WHITESPACE:
238        case OP_NOT_WORDCHAR:
239        case OP_WORDCHAR:
240        case OP_ANY:
241        case OP_ALLANY:
242        case OP_EXTUNI:
243        case OP_HSPACE:
244        case OP_NOT_HSPACE:
245        case OP_VSPACE:
246        case OP_NOT_VSPACE:
247        branchlength++;
248        cc++;
249        break;
250    
251        /* "Any newline" might match two characters */
252    
253        case OP_ANYNL:
254        branchlength += 2;
255        cc++;
256        break;
257    
258        /* The single-byte matcher means we can't proceed in UTF-8 mode */
259    
260        case OP_ANYBYTE:
261    #ifdef SUPPORT_UTF8
262        if (utf8) return -1;
263    #endif
264        branchlength++;
265        cc++;
266        break;
267    
268        /* For repeated character types, we have to test for \p and \P, which have
269        an extra two bytes of parameters. */
270    
271        case OP_TYPESTAR:
272        case OP_TYPEMINSTAR:
273        case OP_TYPEQUERY:
274        case OP_TYPEMINQUERY:
275        case OP_TYPEPOSSTAR:
276        case OP_TYPEPOSQUERY:
277        if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
278        cc += _pcre_OP_lengths[op];
279        break;
280    
281        case OP_TYPEUPTO:
282        case OP_TYPEMINUPTO:
283        case OP_TYPEPOSUPTO:
284        if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
285        cc += _pcre_OP_lengths[op];
286        break;
287    
288        /* Check a class for variable quantification */
289    
290    #ifdef SUPPORT_UTF8
291        case OP_XCLASS:
292        cc += GET(cc, 1) - 33;
293        /* Fall through */
294    #endif
295    
296        case OP_CLASS:
297        case OP_NCLASS:
298        cc += 33;
299    
300        switch (*cc)
301          {
302          case OP_CRPLUS:
303          case OP_CRMINPLUS:
304          branchlength++;
305          /* Fall through */
306    
307          case OP_CRSTAR:
308          case OP_CRMINSTAR:
309          case OP_CRQUERY:
310          case OP_CRMINQUERY:
311          cc++;
312          break;
313    
314          case OP_CRRANGE:
315          case OP_CRMINRANGE:
316          branchlength += GET2(cc,1);
317          cc += 5;
318          break;
319    
320          default:
321          branchlength++;
322          break;
323          }
324        break;
325    
326        /* Backreferences and subroutine calls are treated in the same way: we find
327        the minimum length for the subpattern. A recursion, however, causes an
328        a flag to be set that causes the length of this branch to be ignored. The
329        logic is that a recursion can only make sense if there is another
330        alternation that stops the recursing. That will provide the minimum length
331        (when no recursion happens). A backreference within the group that it is
332        referencing behaves in the same way.
333    
334        If PCRE_JAVASCRIPT_COMPAT is set, a backreference to an unset bracket
335        matches an empty string (by default it causes a matching failure), so in
336        that case we must set the minimum length to zero. */
337    
338        case OP_REF:
339        if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
340          {
341          ce = cs = (uschar *)_pcre_find_bracket(startcode, utf8, GET2(cc, 1));
342          if (cs == NULL) return -2;
343          do ce += GET(ce, 1); while (*ce == OP_ALT);
344          if (cc > cs && cc < ce)
345            {
346            d = 0;
347            had_recurse = TRUE;
348            }
349          else d = find_minlength(cs, startcode, options);
350          }
351        else d = 0;
352        cc += 3;
353    
354        /* Handle repeated back references */
355    
356        switch (*cc)
357          {
358          case OP_CRSTAR:
359          case OP_CRMINSTAR:
360          case OP_CRQUERY:
361          case OP_CRMINQUERY:
362          min = 0;
363          cc++;
364          break;
365    
366          case OP_CRRANGE:
367          case OP_CRMINRANGE:
368          min = GET2(cc, 1);
369          cc += 5;
370          break;
371    
372          default:
373          min = 1;
374          break;
375          }
376    
377        branchlength += min * d;
378        break;
379    
380        case OP_RECURSE:
381        cs = ce = (uschar *)startcode + GET(cc, 1);
382        if (cs == NULL) return -2;
383        do ce += GET(ce, 1); while (*ce == OP_ALT);
384        if (cc > cs && cc < ce)
385          had_recurse = TRUE;
386        else
387          branchlength += find_minlength(cs, startcode, options);
388        cc += 1 + LINK_SIZE;
389        break;
390    
391        /* Anything else does not or need not match a character. We can get the
392        item's length from the table, but for those that can match zero occurrences
393        of a character, we must take special action for UTF-8 characters. */
394    
395        case OP_UPTO:
396        case OP_NOTUPTO:
397        case OP_MINUPTO:
398        case OP_NOTMINUPTO:
399        case OP_POSUPTO:
400        case OP_STAR:
401        case OP_MINSTAR:
402        case OP_NOTMINSTAR:
403        case OP_POSSTAR:
404        case OP_NOTPOSSTAR:
405        case OP_QUERY:
406        case OP_MINQUERY:
407        case OP_NOTMINQUERY:
408        case OP_POSQUERY:
409        case OP_NOTPOSQUERY:
410        cc += _pcre_OP_lengths[op];
411    #ifdef SUPPORT_UTF8
412        if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
413    #endif
414        break;
415    
416        /* Skip these, but we need to add in the name length. */
417    
418        case OP_MARK:
419        case OP_PRUNE_ARG:
420        case OP_SKIP_ARG:
421        case OP_THEN_ARG:
422        cc += _pcre_OP_lengths[op] + cc[1];
423        break;
424    
425        /* For the record, these are the opcodes that are matched by "default":
426        OP_ACCEPT, OP_CLOSE, OP_COMMIT, OP_FAIL, OP_PRUNE, OP_SET_SOM, OP_SKIP,
427        OP_THEN. */
428    
429        default:
430        cc += _pcre_OP_lengths[op];
431        break;
432        }
433      }
434    /* Control never gets here */
435    }
436    
437    
438    
439  /*************************************************  /*************************************************
440  *      Set a bit and maybe its alternate case    *  *      Set a bit and maybe its alternate case    *
441  *************************************************/  *************************************************/
# Line 67  Returns: nothing Line 453  Returns: nothing
453  */  */
454    
455  static void  static void
456  set_bit(uschar *start_bits, unsigned int c, BOOL caseless, compile_data *cd)  set_table_bit(uschar *start_bits, unsigned int c, BOOL caseless,
457      compile_data *cd)
458  {  {
459  start_bits[c/8] |= (1 << (c&7));  start_bits[c/8] |= (1 << (c&7));
460  if (caseless && (cd->ctypes[c] & ctype_letter) != 0)  if (caseless && (cd->ctypes[c] & ctype_letter) != 0)
# Line 213  do Line 600  do
600        tcode += 1 + LINK_SIZE;        tcode += 1 + LINK_SIZE;
601        break;        break;
602    
603          /* SKIPZERO skips the bracket. */
604    
605          case OP_SKIPZERO:
606          tcode++;
607          do tcode += GET(tcode,1); while (*tcode == OP_ALT);
608          tcode += 1 + LINK_SIZE;
609          break;
610    
611        /* Single-char * or ? sets the bit and tries the next item */        /* Single-char * or ? sets the bit and tries the next item */
612    
613        case OP_STAR:        case OP_STAR:
# Line 221  do Line 616  do
616        case OP_QUERY:        case OP_QUERY:
617        case OP_MINQUERY:        case OP_MINQUERY:
618        case OP_POSQUERY:        case OP_POSQUERY:
619        set_bit(start_bits, tcode[1], caseless, cd);        set_table_bit(start_bits, tcode[1], caseless, cd);
620        tcode += 2;        tcode += 2;
621  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
622        if (utf8 && tcode[-1] >= 0xc0)        if (utf8 && tcode[-1] >= 0xc0)
# Line 234  do Line 629  do
629        case OP_UPTO:        case OP_UPTO:
630        case OP_MINUPTO:        case OP_MINUPTO:
631        case OP_POSUPTO:        case OP_POSUPTO:
632        set_bit(start_bits, tcode[3], caseless, cd);        set_table_bit(start_bits, tcode[3], caseless, cd);
633        tcode += 4;        tcode += 4;
634  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
635        if (utf8 && tcode[-1] >= 0xc0)        if (utf8 && tcode[-1] >= 0xc0)
# Line 252  do Line 647  do
647        case OP_PLUS:        case OP_PLUS:
648        case OP_MINPLUS:        case OP_MINPLUS:
649        case OP_POSPLUS:        case OP_POSPLUS:
650        set_bit(start_bits, tcode[1], caseless, cd);        set_table_bit(start_bits, tcode[1], caseless, cd);
651        try_next = FALSE;        try_next = FALSE;
652        break;        break;
653    
# Line 337  do Line 732  do
732        switch(tcode[1])        switch(tcode[1])
733          {          {
734          case OP_ANY:          case OP_ANY:
735            case OP_ALLANY:
736          return SSB_FAIL;          return SSB_FAIL;
737    
738          case OP_NOT_DIGIT:          case OP_NOT_DIGIT:
# Line 394  do Line 790  do
790        character with a value > 255. */        character with a value > 255. */
791    
792        case OP_NCLASS:        case OP_NCLASS:
793    #ifdef SUPPORT_UTF8
794        if (utf8)        if (utf8)
795          {          {
796          start_bits[24] |= 0xf0;              /* Bits for 0xc4 - 0xc8 */          start_bits[24] |= 0xf0;              /* Bits for 0xc4 - 0xc8 */
797          memset(start_bits+25, 0xff, 7);      /* Bits for 0xc9 - 0xff */          memset(start_bits+25, 0xff, 7);      /* Bits for 0xc9 - 0xff */
798          }          }
799    #endif
800        /* Fall through */        /* Fall through */
801    
802        case OP_CLASS:        case OP_CLASS:
# Line 411  do Line 809  do
809          value is > 127. In fact, there are only two possible starting bytes for          value is > 127. In fact, there are only two possible starting bytes for
810          characters in the range 128 - 255. */          characters in the range 128 - 255. */
811    
812    #ifdef SUPPORT_UTF8
813          if (utf8)          if (utf8)
814            {            {
815            for (c = 0; c < 16; c++) start_bits[c] |= tcode[c];            for (c = 0; c < 16; c++) start_bits[c] |= tcode[c];
# Line 428  do Line 827  do
827          /* In non-UTF-8 mode, the two bit maps are completely compatible. */          /* In non-UTF-8 mode, the two bit maps are completely compatible. */
828    
829          else          else
830    #endif
831            {            {
832            for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];            for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
833            }            }
# Line 483  Arguments: Line 883  Arguments:
883              set NULL unless error              set NULL unless error
884    
885  Returns:    pointer to a pcre_extra block, with study_data filled in and the  Returns:    pointer to a pcre_extra block, with study_data filled in and the
886                appropriate flag set;                appropriate flags set;
887              NULL on error or if no optimization possible              NULL on error or if no optimization possible
888  */  */
889    
890  PCRE_DATA_SCOPE pcre_extra *  PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION
891  pcre_study(const pcre *external_re, int options, const char **errorptr)  pcre_study(const pcre *external_re, int options, const char **errorptr)
892  {  {
893    int min;
894    BOOL bits_set = FALSE;
895  uschar start_bits[32];  uschar start_bits[32];
896  pcre_extra *extra;  pcre_extra *extra;
897  pcre_study_data *study;  pcre_study_data *study;
# Line 516  code = (uschar *)re + re->name_table_off Line 918  code = (uschar *)re + re->name_table_off
918    (re->name_count * re->name_entry_size);    (re->name_count * re->name_entry_size);
919    
920  /* For an anchored pattern, or an unanchored pattern that has a first char, or  /* For an anchored pattern, or an unanchored pattern that has a first char, or
921  a multiline pattern that matches only at "line starts", no further processing  a multiline pattern that matches only at "line starts", there is no point in
922  at present. */  seeking a list of starting bytes. */
923    
924  if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)  if ((re->options & PCRE_ANCHORED) == 0 &&
925    return NULL;      (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) == 0)
926      {
927      /* Set the character tables in the block that is passed around */
928    
929      tables = re->tables;
930      if (tables == NULL)
931        (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
932        (void *)(&tables));
933    
934      compile_block.lcc = tables + lcc_offset;
935      compile_block.fcc = tables + fcc_offset;
936      compile_block.cbits = tables + cbits_offset;
937      compile_block.ctypes = tables + ctypes_offset;
938    
939      /* See if we can find a fixed set of initial characters for the pattern. */
940    
941      memset(start_bits, 0, 32 * sizeof(uschar));
942      bits_set = set_start_bits(code, start_bits,
943        (re->options & PCRE_CASELESS) != 0, (re->options & PCRE_UTF8) != 0,
944        &compile_block) == SSB_DONE;
945      }
946    
947  /* Set the character tables in the block that is passed around */  /* Find the minimum length of subject string. */
948    
949  tables = re->tables;  min = find_minlength(code, code, re->options);
950  if (tables == NULL)  
951    (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,  /* Return NULL if no optimization is possible. */
952    (void *)(&tables));  
953    if (!bits_set && min < 0) return NULL;
 compile_block.lcc = tables + lcc_offset;  
 compile_block.fcc = tables + fcc_offset;  
 compile_block.cbits = tables + cbits_offset;  
 compile_block.ctypes = tables + ctypes_offset;  
   
 /* See if we can find a fixed set of initial characters for the pattern. */  
   
 memset(start_bits, 0, 32 * sizeof(uschar));  
 if (set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0,  
   (re->options & PCRE_UTF8) != 0, &compile_block) != SSB_DONE) return NULL;  
954    
955  /* Get a pcre_extra block and a pcre_study_data block. The study data is put in  /* Get a pcre_extra block and a pcre_study_data block. The study data is put in
956  the latter, which is pointed to by the former, which may also get additional  the latter, which is pointed to by the former, which may also get additional
# Line 561  extra->flags = PCRE_EXTRA_STUDY_DATA; Line 973  extra->flags = PCRE_EXTRA_STUDY_DATA;
973  extra->study_data = study;  extra->study_data = study;
974    
975  study->size = sizeof(pcre_study_data);  study->size = sizeof(pcre_study_data);
976  study->options = PCRE_STUDY_MAPPED;  study->flags = 0;
977  memcpy(study->start_bits, start_bits, sizeof(start_bits));  
978    if (bits_set)
979      {
980      study->flags |= PCRE_STUDY_MAPPED;
981      memcpy(study->start_bits, start_bits, sizeof(start_bits));
982      }
983    
984    if (min >= 0)
985      {
986      study->flags |= PCRE_STUDY_MINLEN;
987      study->minlength = min;
988      }
989    
990  return extra;  return extra;
991  }  }

Legend:
Removed from v.93  
changed lines
  Added in v.510

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12