/[pcre]/code/trunk/pcre_study.c
ViewVC logotype

Diff of /code/trunk/pcre_study.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 91 by nigel, Sat Feb 24 21:41:34 2007 UTC revision 461 by ph10, Mon Oct 5 10:59:35 2009 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2006 University of Cambridge             Copyright (c) 1997-2009 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 42  POSSIBILITY OF SUCH DAMAGE. Line 42  POSSIBILITY OF SUCH DAMAGE.
42  supporting functions. */  supporting functions. */
43    
44    
45    #ifdef HAVE_CONFIG_H
46    #include "config.h"
47    #endif
48    
49  #include "pcre_internal.h"  #include "pcre_internal.h"
50    
51    
52    /* Returns from set_start_bits() */
53    
54    enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE };
55    
56    
57    
58    /*************************************************
59    *   Find the minimum subject length for a group  *
60    *************************************************/
61    
62    /* Scan a parenthesized group and compute the minimum length of subject that
63    is needed to match it. This is a lower bound; it does not mean there is a
64    string of that length that matches. In UTF8 mode, the result is in characters
65    rather than bytes.
66    
67    Arguments:
68      code       pointer to start of group (the bracket)
69      startcode  pointer to start of the whole pattern
70      options    the compiling options
71    
72    Returns:   the minimum length
73               -1 if \C was encountered
74               -2 internal error (missing capturing bracket)
75    */
76    
77    static int
78    find_minlength(const uschar *code, const uschar *startcode, int options)
79    {
80    int length = -1;
81    BOOL utf8 = (options & PCRE_UTF8) != 0;
82    BOOL had_recurse = FALSE;
83    register int branchlength = 0;
84    register uschar *cc = (uschar *)code + 1 + LINK_SIZE;
85    
86    if (*code == OP_CBRA || *code == OP_SCBRA) cc += 2;
87    
88    /* Scan along the opcodes for this branch. If we get to the end of the
89    branch, check the length against that of the other branches. */
90    
91    for (;;)
92      {
93      int d, min;
94      uschar *cs, *ce;
95      register int op = *cc;
96    
97      switch (op)
98        {
99        case OP_CBRA:
100        case OP_SCBRA:
101        case OP_BRA:
102        case OP_SBRA:
103        case OP_ONCE:
104        case OP_COND:
105        case OP_SCOND:
106        d = find_minlength(cc, startcode, options);
107        if (d < 0) return d;
108        branchlength += d;
109        do cc += GET(cc, 1); while (*cc == OP_ALT);
110        cc += 1 + LINK_SIZE;
111        break;
112    
113        /* Reached end of a branch; if it's a ket it is the end of a nested
114        call. If it's ALT it is an alternation in a nested call. If it is
115        END it's the end of the outer call. All can be handled by the same code. */
116    
117        case OP_ALT:
118        case OP_KET:
119        case OP_KETRMAX:
120        case OP_KETRMIN:
121        case OP_END:
122        if (length < 0 || (!had_recurse && branchlength < length))
123          length = branchlength;
124        if (*cc != OP_ALT) return length;
125        cc += 1 + LINK_SIZE;
126        branchlength = 0;
127        had_recurse = FALSE;
128        break;
129    
130        /* Skip over assertive subpatterns */
131    
132        case OP_ASSERT:
133        case OP_ASSERT_NOT:
134        case OP_ASSERTBACK:
135        case OP_ASSERTBACK_NOT:
136        do cc += GET(cc, 1); while (*cc == OP_ALT);
137        /* Fall through */
138    
139        /* Skip over things that don't match chars */
140    
141        case OP_REVERSE:
142        case OP_CREF:
143        case OP_NCREF:
144        case OP_RREF:
145        case OP_NRREF:
146        case OP_DEF:
147        case OP_OPT:
148        case OP_CALLOUT:
149        case OP_SOD:
150        case OP_SOM:
151        case OP_EOD:
152        case OP_EODN:
153        case OP_CIRC:
154        case OP_DOLL:
155        case OP_NOT_WORD_BOUNDARY:
156        case OP_WORD_BOUNDARY:
157        cc += _pcre_OP_lengths[*cc];
158        break;
159    
160        /* Skip over a subpattern that has a {0} or {0,x} quantifier */
161    
162        case OP_BRAZERO:
163        case OP_BRAMINZERO:
164        case OP_SKIPZERO:
165        cc += _pcre_OP_lengths[*cc];
166        do cc += GET(cc, 1); while (*cc == OP_ALT);
167        cc += 1 + LINK_SIZE;
168        break;
169    
170        /* Handle literal characters and + repetitions */
171    
172        case OP_CHAR:
173        case OP_CHARNC:
174        case OP_NOT:
175        case OP_PLUS:
176        case OP_MINPLUS:
177        case OP_POSPLUS:
178        case OP_NOTPLUS:
179        case OP_NOTMINPLUS:
180        case OP_NOTPOSPLUS:
181        branchlength++;
182        cc += 2;
183    #ifdef SUPPORT_UTF8
184        if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
185    #endif
186        break;
187    
188        case OP_TYPEPLUS:
189        case OP_TYPEMINPLUS:
190        case OP_TYPEPOSPLUS:
191        branchlength++;
192        cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
193        break;
194    
195        /* Handle exact repetitions. The count is already in characters, but we
196        need to skip over a multibyte character in UTF8 mode.  */
197    
198        case OP_EXACT:
199        case OP_NOTEXACT:
200        branchlength += GET2(cc,1);
201        cc += 4;
202    #ifdef SUPPORT_UTF8
203        if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
204    #endif
205        break;
206    
207        case OP_TYPEEXACT:
208        branchlength += GET2(cc,1);
209        cc += (cc[3] == OP_PROP || cc[3] == OP_NOTPROP)? 6 : 4;
210        break;
211    
212        /* Handle single-char non-literal matchers */
213    
214        case OP_PROP:
215        case OP_NOTPROP:
216        cc += 2;
217        /* Fall through */
218    
219        case OP_NOT_DIGIT:
220        case OP_DIGIT:
221        case OP_NOT_WHITESPACE:
222        case OP_WHITESPACE:
223        case OP_NOT_WORDCHAR:
224        case OP_WORDCHAR:
225        case OP_ANY:
226        case OP_ALLANY:
227        case OP_EXTUNI:
228        case OP_HSPACE:
229        case OP_NOT_HSPACE:
230        case OP_VSPACE:
231        case OP_NOT_VSPACE:
232        branchlength++;
233        cc++;
234        break;
235    
236        /* "Any newline" might match two characters */
237    
238        case OP_ANYNL:
239        branchlength += 2;
240        cc++;
241        break;
242    
243        /* The single-byte matcher means we can't proceed in UTF-8 mode */
244    
245        case OP_ANYBYTE:
246    #ifdef SUPPORT_UTF8
247        if (utf8) return -1;
248    #endif
249        branchlength++;
250        cc++;
251        break;
252    
253        /* For repeated character types, we have to test for \p and \P, which have
254        an extra two bytes of parameters. */
255    
256        case OP_TYPESTAR:
257        case OP_TYPEMINSTAR:
258        case OP_TYPEQUERY:
259        case OP_TYPEMINQUERY:
260        case OP_TYPEPOSSTAR:
261        case OP_TYPEPOSQUERY:
262        if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
263        cc += _pcre_OP_lengths[op];
264        break;
265    
266        case OP_TYPEUPTO:
267        case OP_TYPEMINUPTO:
268        case OP_TYPEPOSUPTO:
269        if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
270        cc += _pcre_OP_lengths[op];
271        break;
272    
273        /* Check a class for variable quantification */
274    
275    #ifdef SUPPORT_UTF8
276        case OP_XCLASS:
277        cc += GET(cc, 1) - 33;
278        /* Fall through */
279    #endif
280    
281        case OP_CLASS:
282        case OP_NCLASS:
283        cc += 33;
284    
285        switch (*cc)
286          {
287          case OP_CRPLUS:
288          case OP_CRMINPLUS:
289          branchlength++;
290          /* Fall through */
291    
292          case OP_CRSTAR:
293          case OP_CRMINSTAR:
294          case OP_CRQUERY:
295          case OP_CRMINQUERY:
296          cc++;
297          break;
298    
299          case OP_CRRANGE:
300          case OP_CRMINRANGE:
301          branchlength += GET2(cc,1);
302          cc += 5;
303          break;
304    
305          default:
306          branchlength++;
307          break;
308          }
309        break;
310    
311        /* Backreferences and subroutine calls are treated in the same way: we find
312        the minimum length for the subpattern. A recursion, however, causes an
313        a flag to be set that causes the length of this branch to be ignored. The
314        logic is that a recursion can only make sense if there is another
315        alternation that stops the recursing. That will provide the minimum length
316        (when no recursion happens). A backreference within the group that it is
317        referencing behaves in the same way. */
318    
319        case OP_REF:
320        ce = cs = (uschar *)_pcre_find_bracket(startcode, utf8, GET2(cc, 1));
321        if (cs == NULL) return -2;
322        do ce += GET(ce, 1); while (*ce == OP_ALT);
323        if (cc > cs && cc < ce)
324          {
325          d = 0;
326          had_recurse = TRUE;
327          }
328        else d = find_minlength(cs, startcode, options);
329        cc += 3;
330    
331        /* Handle repeated back references */
332    
333        switch (*cc)
334          {
335          case OP_CRSTAR:
336          case OP_CRMINSTAR:
337          case OP_CRQUERY:
338          case OP_CRMINQUERY:
339          min = 0;
340          cc++;
341          break;
342    
343          case OP_CRRANGE:
344          case OP_CRMINRANGE:
345          min = GET2(cc, 1);
346          cc += 5;
347          break;
348    
349          default:
350          min = 1;
351          break;
352          }
353    
354        branchlength += min * d;
355        break;
356    
357        case OP_RECURSE:
358        cs = ce = (uschar *)startcode + GET(cc, 1);
359        if (cs == NULL) return -2;
360        do ce += GET(ce, 1); while (*ce == OP_ALT);
361        if (cc > cs && cc < ce)
362          had_recurse = TRUE;
363        else
364          branchlength += find_minlength(cs, startcode, options);
365        cc += 1 + LINK_SIZE;
366        break;
367    
368        /* Anything else does not or need not match a character. We can get the
369        item's length from the table, but for those that can match zero occurrences
370        of a character, we must take special action for UTF-8 characters. */
371    
372        case OP_UPTO:
373        case OP_NOTUPTO:
374        case OP_MINUPTO:
375        case OP_NOTMINUPTO:
376        case OP_POSUPTO:
377        case OP_STAR:
378        case OP_MINSTAR:
379        case OP_NOTMINSTAR:
380        case OP_POSSTAR:
381        case OP_NOTPOSSTAR:
382        case OP_QUERY:
383        case OP_MINQUERY:
384        case OP_NOTMINQUERY:
385        case OP_POSQUERY:
386        case OP_NOTPOSQUERY:
387        cc += _pcre_OP_lengths[op];
388    #ifdef SUPPORT_UTF8
389        if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
390    #endif
391        break;
392    
393        /* For the record, these are the opcodes that are matched by "default":
394        OP_ACCEPT, OP_CLOSE, OP_COMMIT, OP_FAIL, OP_PRUNE, OP_SET_SOM, OP_SKIP,
395        OP_THEN. */
396    
397        default:
398        cc += _pcre_OP_lengths[op];
399        break;
400        }
401      }
402    /* Control never gets here */
403    }
404    
405    
406    
407  /*************************************************  /*************************************************
408  *      Set a bit and maybe its alternate case    *  *      Set a bit and maybe its alternate case    *
409  *************************************************/  *************************************************/
# Line 72  if (caseless && (cd->ctypes[c] & ctype_l Line 431  if (caseless && (cd->ctypes[c] & ctype_l
431    
432    
433  /*************************************************  /*************************************************
434  *          Create bitmap of starting chars       *  *          Create bitmap of starting bytes       *
435  *************************************************/  *************************************************/
436    
437  /* This function scans a compiled unanchored expression and attempts to build a  /* This function scans a compiled unanchored expression recursively and
438  bitmap of the set of initial characters. If it can't, it returns FALSE. As time  attempts to build a bitmap of the set of possible starting bytes. As time goes
439  goes by, we may be able to get more clever at doing this.  by, we may be able to get more clever at doing this. The SSB_CONTINUE return is
440    useful for parenthesized groups in patterns such as (a*)b where the group
441    provides some optional starting bytes but scanning must continue at the outer
442    level to find at least one mandatory byte. At the outermost level, this
443    function fails unless the result is SSB_DONE.
444    
445  Arguments:  Arguments:
446    code         points to an expression    code         points to an expression
# Line 86  Arguments: Line 449  Arguments:
449    utf8         TRUE if in UTF-8 mode    utf8         TRUE if in UTF-8 mode
450    cd           the block with char table pointers    cd           the block with char table pointers
451    
452  Returns:       TRUE if table built, FALSE otherwise  Returns:       SSB_FAIL     => Failed to find any starting bytes
453                   SSB_DONE     => Found mandatory starting bytes
454                   SSB_CONTINUE => Found optional starting bytes
455  */  */
456    
457  static BOOL  static int
458  set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,  set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
459    BOOL utf8, compile_data *cd)    BOOL utf8, compile_data *cd)
460  {  {
461  register int c;  register int c;
462    int yield = SSB_DONE;
463    
464  #if 0  #if 0
465  /* ========================================================================= */  /* ========================================================================= */
# Line 114  volatile int dummy; Line 480  volatile int dummy;
480    
481  do  do
482    {    {
483    const uschar *tcode = code + 1 + LINK_SIZE;    const uschar *tcode = code + (((int)*code == OP_CBRA)? 3:1) + LINK_SIZE;
484    BOOL try_next = TRUE;    BOOL try_next = TRUE;
485    
486    while (try_next)    while (try_next)    /* Loop for items in this branch */
487      {      {
488      /* If a branch starts with a bracket or a positive lookahead assertion,      int rc;
489      recurse to set bits from within them. That's all for this branch. */      switch(*tcode)
   
     if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT)  
490        {        {
491        if (!set_start_bits(tcode, start_bits, caseless, utf8, cd))        /* Fail if we reach something we don't understand */
         return FALSE;  
       try_next = FALSE;  
       }  
492    
     else switch(*tcode)  
       {  
493        default:        default:
494        return FALSE;        return SSB_FAIL;
495    
496        /* Skip over callout */        /* If we hit a bracket or a positive lookahead assertion, recurse to set
497          bits from within the subpattern. If it can't find anything, we have to
498          give up. If it finds some mandatory character(s), we are done for this
499          branch. Otherwise, carry on scanning after the subpattern. */
500    
501          case OP_BRA:
502          case OP_SBRA:
503          case OP_CBRA:
504          case OP_SCBRA:
505          case OP_ONCE:
506          case OP_ASSERT:
507          rc = set_start_bits(tcode, start_bits, caseless, utf8, cd);
508          if (rc == SSB_FAIL) return SSB_FAIL;
509          if (rc == SSB_DONE) try_next = FALSE; else
510            {
511            do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
512            tcode += 1 + LINK_SIZE;
513            }
514          break;
515    
516        case OP_CALLOUT:        /* If we hit ALT or KET, it means we haven't found anything mandatory in
517        tcode += 2 + 2*LINK_SIZE;        this branch, though we might have found something optional. For ALT, we
518          continue with the next alternative, but we have to arrange that the final
519          result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
520          return SSB_CONTINUE: if this is the top level, that indicates failure,
521          but after a nested subpattern, it causes scanning to continue. */
522    
523          case OP_ALT:
524          yield = SSB_CONTINUE;
525          try_next = FALSE;
526        break;        break;
527    
528        /* Skip over extended extraction bracket number */        case OP_KET:
529          case OP_KETRMAX:
530          case OP_KETRMIN:
531          return SSB_CONTINUE;
532    
533        case OP_BRANUMBER:        /* Skip over callout */
534        tcode += 3;  
535          case OP_CALLOUT:
536          tcode += 2 + 2*LINK_SIZE;
537        break;        break;
538    
539        /* Skip over lookbehind and negative lookahead assertions */        /* Skip over lookbehind and negative lookahead assertions */
# Line 152  do Line 542  do
542        case OP_ASSERTBACK:        case OP_ASSERTBACK:
543        case OP_ASSERTBACK_NOT:        case OP_ASSERTBACK_NOT:
544        do tcode += GET(tcode, 1); while (*tcode == OP_ALT);        do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
545        tcode += 1+LINK_SIZE;        tcode += 1 + LINK_SIZE;
546        break;        break;
547    
548        /* Skip over an option setting, changing the caseless flag */        /* Skip over an option setting, changing the caseless flag */
# Line 166  do Line 556  do
556    
557        case OP_BRAZERO:        case OP_BRAZERO:
558        case OP_BRAMINZERO:        case OP_BRAMINZERO:
559        if (!set_start_bits(++tcode, start_bits, caseless, utf8, cd))        if (set_start_bits(++tcode, start_bits, caseless, utf8, cd) == SSB_FAIL)
560          return FALSE;          return SSB_FAIL;
561  /* =========================================================================  /* =========================================================================
562        See the comment at the head of this function concerning the next line,        See the comment at the head of this function concerning the next line,
563        which was an old fudge for the benefit of OS/2.        which was an old fudge for the benefit of OS/2.
564        dummy = 1;        dummy = 1;
565    ========================================================================= */    ========================================================================= */
566        do tcode += GET(tcode,1); while (*tcode == OP_ALT);        do tcode += GET(tcode,1); while (*tcode == OP_ALT);
567        tcode += 1+LINK_SIZE;        tcode += 1 + LINK_SIZE;
568          break;
569    
570          /* SKIPZERO skips the bracket. */
571    
572          case OP_SKIPZERO:
573          tcode++;
574          do tcode += GET(tcode,1); while (*tcode == OP_ALT);
575          tcode += 1 + LINK_SIZE;
576        break;        break;
577    
578        /* Single-char * or ? sets the bit and tries the next item */        /* Single-char * or ? sets the bit and tries the next item */
579    
580        case OP_STAR:        case OP_STAR:
581        case OP_MINSTAR:        case OP_MINSTAR:
582          case OP_POSSTAR:
583        case OP_QUERY:        case OP_QUERY:
584        case OP_MINQUERY:        case OP_MINQUERY:
585          case OP_POSQUERY:
586        set_bit(start_bits, tcode[1], caseless, cd);        set_bit(start_bits, tcode[1], caseless, cd);
587        tcode += 2;        tcode += 2;
588  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
589        if (utf8) while ((*tcode & 0xc0) == 0x80) tcode++;        if (utf8 && tcode[-1] >= 0xc0)
590            tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];
591  #endif  #endif
592        break;        break;
593    
# Line 194  do Line 595  do
595    
596        case OP_UPTO:        case OP_UPTO:
597        case OP_MINUPTO:        case OP_MINUPTO:
598          case OP_POSUPTO:
599        set_bit(start_bits, tcode[3], caseless, cd);        set_bit(start_bits, tcode[3], caseless, cd);
600        tcode += 4;        tcode += 4;
601  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
602        if (utf8) while ((*tcode & 0xc0) == 0x80) tcode++;        if (utf8 && tcode[-1] >= 0xc0)
603            tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];
604  #endif  #endif
605        break;        break;
606    
# Line 210  do Line 613  do
613        case OP_CHARNC:        case OP_CHARNC:
614        case OP_PLUS:        case OP_PLUS:
615        case OP_MINPLUS:        case OP_MINPLUS:
616          case OP_POSPLUS:
617        set_bit(start_bits, tcode[1], caseless, cd);        set_bit(start_bits, tcode[1], caseless, cd);
618        try_next = FALSE;        try_next = FALSE;
619        break;        break;
# Line 283  do Line 687  do
687    
688        case OP_TYPEUPTO:        case OP_TYPEUPTO:
689        case OP_TYPEMINUPTO:        case OP_TYPEMINUPTO:
690          case OP_TYPEPOSUPTO:
691        tcode += 2;               /* Fall through */        tcode += 2;               /* Fall through */
692    
693        case OP_TYPESTAR:        case OP_TYPESTAR:
694        case OP_TYPEMINSTAR:        case OP_TYPEMINSTAR:
695          case OP_TYPEPOSSTAR:
696        case OP_TYPEQUERY:        case OP_TYPEQUERY:
697        case OP_TYPEMINQUERY:        case OP_TYPEMINQUERY:
698          case OP_TYPEPOSQUERY:
699        switch(tcode[1])        switch(tcode[1])
700          {          {
701          case OP_ANY:          case OP_ANY:
702          return FALSE;          case OP_ALLANY:
703            return SSB_FAIL;
704    
705          case OP_NOT_DIGIT:          case OP_NOT_DIGIT:
706          for (c = 0; c < 32; c++)          for (c = 0; c < 32; c++)
# Line 349  do Line 757  do
757        character with a value > 255. */        character with a value > 255. */
758    
759        case OP_NCLASS:        case OP_NCLASS:
760    #ifdef SUPPORT_UTF8
761        if (utf8)        if (utf8)
762          {          {
763          start_bits[24] |= 0xf0;              /* Bits for 0xc4 - 0xc8 */          start_bits[24] |= 0xf0;              /* Bits for 0xc4 - 0xc8 */
764          memset(start_bits+25, 0xff, 7);      /* Bits for 0xc9 - 0xff */          memset(start_bits+25, 0xff, 7);      /* Bits for 0xc9 - 0xff */
765          }          }
766    #endif
767        /* Fall through */        /* Fall through */
768    
769        case OP_CLASS:        case OP_CLASS:
# Line 366  do Line 776  do
776          value is > 127. In fact, there are only two possible starting bytes for          value is > 127. In fact, there are only two possible starting bytes for
777          characters in the range 128 - 255. */          characters in the range 128 - 255. */
778    
779    #ifdef SUPPORT_UTF8
780          if (utf8)          if (utf8)
781            {            {
782            for (c = 0; c < 16; c++) start_bits[c] |= tcode[c];            for (c = 0; c < 16; c++) start_bits[c] |= tcode[c];
# Line 383  do Line 794  do
794          /* In non-UTF-8 mode, the two bit maps are completely compatible. */          /* In non-UTF-8 mode, the two bit maps are completely compatible. */
795    
796          else          else
797    #endif
798            {            {
799            for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];            for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
800            }            }
# Line 418  do Line 830  do
830    code += GET(code, 1);   /* Advance to next branch */    code += GET(code, 1);   /* Advance to next branch */
831    }    }
832  while (*code == OP_ALT);  while (*code == OP_ALT);
833  return TRUE;  return yield;
834  }  }
835    
836    
# Line 438  Arguments: Line 850  Arguments:
850              set NULL unless error              set NULL unless error
851    
852  Returns:    pointer to a pcre_extra block, with study_data filled in and the  Returns:    pointer to a pcre_extra block, with study_data filled in and the
853                appropriate flag set;                appropriate flags set;
854              NULL on error or if no optimization possible              NULL on error or if no optimization possible
855  */  */
856    
857  PCRE_DATA_SCOPE pcre_extra *  PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION
858  pcre_study(const pcre *external_re, int options, const char **errorptr)  pcre_study(const pcre *external_re, int options, const char **errorptr)
859  {  {
860    int min;
861    BOOL bits_set = FALSE;
862  uschar start_bits[32];  uschar start_bits[32];
863  pcre_extra *extra;  pcre_extra *extra;
864  pcre_study_data *study;  pcre_study_data *study;
# Line 471  code = (uschar *)re + re->name_table_off Line 885  code = (uschar *)re + re->name_table_off
885    (re->name_count * re->name_entry_size);    (re->name_count * re->name_entry_size);
886    
887  /* For an anchored pattern, or an unanchored pattern that has a first char, or  /* For an anchored pattern, or an unanchored pattern that has a first char, or
888  a multiline pattern that matches only at "line starts", no further processing  a multiline pattern that matches only at "line starts", there is no point in
889  at present. */  seeking a list of starting bytes. */
890    
891  if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)  if ((re->options & PCRE_ANCHORED) == 0 &&
892    return NULL;      (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) == 0)
893      {
894      /* Set the character tables in the block that is passed around */
895    
896      tables = re->tables;
897      if (tables == NULL)
898        (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
899        (void *)(&tables));
900    
901      compile_block.lcc = tables + lcc_offset;
902      compile_block.fcc = tables + fcc_offset;
903      compile_block.cbits = tables + cbits_offset;
904      compile_block.ctypes = tables + ctypes_offset;
905    
906      /* See if we can find a fixed set of initial characters for the pattern. */
907    
908      memset(start_bits, 0, 32 * sizeof(uschar));
909      bits_set = set_start_bits(code, start_bits,
910        (re->options & PCRE_CASELESS) != 0, (re->options & PCRE_UTF8) != 0,
911        &compile_block) == SSB_DONE;
912      }
913    
914    /* Find the minimum length of subject string. */
915    
916    min = find_minlength(code, code, re->options);
917    
918  /* Set the character tables in the block that is passed around */  /* Return NULL if no optimization is possible. */
919    
920  tables = re->tables;  if (!bits_set && min < 0) return NULL;
 if (tables == NULL)  
   (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,  
   (void *)(&tables));  
   
 compile_block.lcc = tables + lcc_offset;  
 compile_block.fcc = tables + fcc_offset;  
 compile_block.cbits = tables + cbits_offset;  
 compile_block.ctypes = tables + ctypes_offset;  
   
 /* See if we can find a fixed set of initial characters for the pattern. */  
   
 memset(start_bits, 0, 32 * sizeof(uschar));  
 if (!set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0,  
   (re->options & PCRE_UTF8) != 0, &compile_block)) return NULL;  
921    
922  /* Get a pcre_extra block and a pcre_study_data block. The study data is put in  /* Get a pcre_extra block and a pcre_study_data block. The study data is put in
923  the latter, which is pointed to by the former, which may also get additional  the latter, which is pointed to by the former, which may also get additional
# Line 516  extra->flags = PCRE_EXTRA_STUDY_DATA; Line 940  extra->flags = PCRE_EXTRA_STUDY_DATA;
940  extra->study_data = study;  extra->study_data = study;
941    
942  study->size = sizeof(pcre_study_data);  study->size = sizeof(pcre_study_data);
943  study->options = PCRE_STUDY_MAPPED;  study->flags = 0;
944  memcpy(study->start_bits, start_bits, sizeof(start_bits));  
945    if (bits_set)
946      {
947      study->flags |= PCRE_STUDY_MAPPED;
948      memcpy(study->start_bits, start_bits, sizeof(start_bits));
949      }
950    
951    if (min >= 0)
952      {
953      study->flags |= PCRE_STUDY_MINLEN;
954      study->minlength = min;
955      }
956    
957  return extra;  return extra;
958  }  }

Legend:
Removed from v.91  
changed lines
  Added in v.461

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12