/[pcre]/code/trunk/pcre_study.c
ViewVC logotype

Diff of /code/trunk/pcre_study.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 96 by nigel, Fri Mar 2 13:10:43 2007 UTC revision 605 by ph10, Fri Jun 3 18:18:30 2011 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2006 University of Cambridge             Copyright (c) 1997-2010 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 42  POSSIBILITY OF SUCH DAMAGE. Line 42  POSSIBILITY OF SUCH DAMAGE.
42  supporting functions. */  supporting functions. */
43    
44    
45    #ifdef HAVE_CONFIG_H
46    #include "config.h"
47    #endif
48    
49  #include "pcre_internal.h"  #include "pcre_internal.h"
50    
51    #define SET_BIT(c) start_bits[c/8] |= (1 << (c&7))
52    
53  /* Returns from set_start_bits() */  /* Returns from set_start_bits() */
54    
55  enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE };  enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN };
56    
57    
58    
59    /*************************************************
60    *   Find the minimum subject length for a group  *
61    *************************************************/
62    
63    /* Scan a parenthesized group and compute the minimum length of subject that
64    is needed to match it. This is a lower bound; it does not mean there is a
65    string of that length that matches. In UTF8 mode, the result is in characters
66    rather than bytes.
67    
68    Arguments:
69      code       pointer to start of group (the bracket)
70      startcode  pointer to start of the whole pattern
71      options    the compiling options
72    
73    Returns:   the minimum length
74               -1 if \C was encountered
75               -2 internal error (missing capturing bracket)
76               -3 internal error (opcode not listed)
77    */
78    
79    static int
80    find_minlength(const uschar *code, const uschar *startcode, int options)
81    {
82    int length = -1;
83    BOOL utf8 = (options & PCRE_UTF8) != 0;
84    BOOL had_recurse = FALSE;
85    register int branchlength = 0;
86    register uschar *cc = (uschar *)code + 1 + LINK_SIZE;
87    
88    if (*code == OP_CBRA || *code == OP_SCBRA ||
89        *code == OP_CBRAPOS || *code == OP_SCBRAPOS) cc += 2;
90    
91    /* Scan along the opcodes for this branch. If we get to the end of the
92    branch, check the length against that of the other branches. */
93    
94    for (;;)
95      {
96      int d, min;
97      uschar *cs, *ce;
98      register int op = *cc;
99    
100      switch (op)
101        {
102        case OP_COND:
103        case OP_SCOND:
104    
105        /* If there is only one branch in a condition, the implied branch has zero
106        length, so we don't add anything. This covers the DEFINE "condition"
107        automatically. */
108    
109        cs = cc + GET(cc, 1);
110        if (*cs != OP_ALT)
111          {
112          cc = cs + 1 + LINK_SIZE;
113          break;
114          }
115    
116        /* Otherwise we can fall through and treat it the same as any other
117        subpattern. */
118    
119        case OP_CBRA:
120        case OP_SCBRA:
121        case OP_BRA:
122        case OP_SBRA:
123        case OP_CBRAPOS:
124        case OP_SCBRAPOS:
125        case OP_BRAPOS:
126        case OP_SBRAPOS:
127        case OP_ONCE:
128        d = find_minlength(cc, startcode, options);
129        if (d < 0) return d;
130        branchlength += d;
131        do cc += GET(cc, 1); while (*cc == OP_ALT);
132        cc += 1 + LINK_SIZE;
133        break;
134    
135        /* Reached end of a branch; if it's a ket it is the end of a nested
136        call. If it's ALT it is an alternation in a nested call. If it is
137        END it's the end of the outer call. All can be handled by the same code. */
138    
139        case OP_ALT:
140        case OP_KET:
141        case OP_KETRMAX:
142        case OP_KETRMIN:
143        case OP_KETRPOS:
144        case OP_END:
145        if (length < 0 || (!had_recurse && branchlength < length))
146          length = branchlength;
147        if (*cc != OP_ALT) return length;
148        cc += 1 + LINK_SIZE;
149        branchlength = 0;
150        had_recurse = FALSE;
151        break;
152    
153        /* Skip over assertive subpatterns */
154    
155        case OP_ASSERT:
156        case OP_ASSERT_NOT:
157        case OP_ASSERTBACK:
158        case OP_ASSERTBACK_NOT:
159        do cc += GET(cc, 1); while (*cc == OP_ALT);
160        /* Fall through */
161    
162        /* Skip over things that don't match chars */
163    
164        case OP_REVERSE:
165        case OP_CREF:
166        case OP_NCREF:
167        case OP_RREF:
168        case OP_NRREF:
169        case OP_DEF:
170        case OP_CALLOUT:
171        case OP_SOD:
172        case OP_SOM:
173        case OP_EOD:
174        case OP_EODN:
175        case OP_CIRC:
176        case OP_CIRCM:
177        case OP_DOLL:
178        case OP_DOLLM:
179        case OP_NOT_WORD_BOUNDARY:
180        case OP_WORD_BOUNDARY:
181        cc += _pcre_OP_lengths[*cc];
182        break;
183    
184        /* Skip over a subpattern that has a {0} or {0,x} quantifier */
185    
186        case OP_BRAZERO:
187        case OP_BRAMINZERO:
188        case OP_BRAPOSZERO:
189        case OP_SKIPZERO:
190        cc += _pcre_OP_lengths[*cc];
191        do cc += GET(cc, 1); while (*cc == OP_ALT);
192        cc += 1 + LINK_SIZE;
193        break;
194    
195        /* Handle literal characters and + repetitions */
196    
197        case OP_CHAR:
198        case OP_CHARI:
199        case OP_NOT:
200        case OP_NOTI:
201        case OP_PLUS:
202        case OP_PLUSI:
203        case OP_MINPLUS:
204        case OP_MINPLUSI:
205        case OP_POSPLUS:
206        case OP_POSPLUSI:
207        case OP_NOTPLUS:
208        case OP_NOTPLUSI:
209        case OP_NOTMINPLUS:
210        case OP_NOTMINPLUSI:
211        case OP_NOTPOSPLUS:
212        case OP_NOTPOSPLUSI:
213        branchlength++;
214        cc += 2;
215    #ifdef SUPPORT_UTF8
216        if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
217    #endif
218        break;
219    
220        case OP_TYPEPLUS:
221        case OP_TYPEMINPLUS:
222        case OP_TYPEPOSPLUS:
223        branchlength++;
224        cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
225        break;
226    
227        /* Handle exact repetitions. The count is already in characters, but we
228        need to skip over a multibyte character in UTF8 mode.  */
229    
230        case OP_EXACT:
231        case OP_EXACTI:
232        case OP_NOTEXACT:
233        case OP_NOTEXACTI:
234        branchlength += GET2(cc,1);
235        cc += 4;
236    #ifdef SUPPORT_UTF8
237        if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
238    #endif
239        break;
240    
241        case OP_TYPEEXACT:
242        branchlength += GET2(cc,1);
243        cc += (cc[3] == OP_PROP || cc[3] == OP_NOTPROP)? 6 : 4;
244        break;
245    
246        /* Handle single-char non-literal matchers */
247    
248        case OP_PROP:
249        case OP_NOTPROP:
250        cc += 2;
251        /* Fall through */
252    
253        case OP_NOT_DIGIT:
254        case OP_DIGIT:
255        case OP_NOT_WHITESPACE:
256        case OP_WHITESPACE:
257        case OP_NOT_WORDCHAR:
258        case OP_WORDCHAR:
259        case OP_ANY:
260        case OP_ALLANY:
261        case OP_EXTUNI:
262        case OP_HSPACE:
263        case OP_NOT_HSPACE:
264        case OP_VSPACE:
265        case OP_NOT_VSPACE:
266        branchlength++;
267        cc++;
268        break;
269    
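270      /* "Any newline" might match two characters (CRLF), but it can also match just one. NOTE(review): adding 2 below may overstate the minimum length - confirm */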
271    
272        case OP_ANYNL:
273        branchlength += 2;
274        cc++;
275        break;
276    
277        /* The single-byte matcher means we can't proceed in UTF-8 mode */
278    
279        case OP_ANYBYTE:
280    #ifdef SUPPORT_UTF8
281        if (utf8) return -1;
282    #endif
283        branchlength++;
284        cc++;
285        break;
286    
287        /* For repeated character types, we have to test for \p and \P, which have
288        an extra two bytes of parameters. */
289    
290        case OP_TYPESTAR:
291        case OP_TYPEMINSTAR:
292        case OP_TYPEQUERY:
293        case OP_TYPEMINQUERY:
294        case OP_TYPEPOSSTAR:
295        case OP_TYPEPOSQUERY:
296        if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
297        cc += _pcre_OP_lengths[op];
298        break;
299    
300        case OP_TYPEUPTO:
301        case OP_TYPEMINUPTO:
302        case OP_TYPEPOSUPTO:
303        if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
304        cc += _pcre_OP_lengths[op];
305        break;
306    
307        /* Check a class for variable quantification */
308    
309    #ifdef SUPPORT_UTF8
310        case OP_XCLASS:
311        cc += GET(cc, 1) - 33;
312        /* Fall through */
313    #endif
314    
315        case OP_CLASS:
316        case OP_NCLASS:
317        cc += 33;
318    
319        switch (*cc)
320          {
321          case OP_CRPLUS:
322          case OP_CRMINPLUS:
323          branchlength++;
324          /* Fall through */
325    
326          case OP_CRSTAR:
327          case OP_CRMINSTAR:
328          case OP_CRQUERY:
329          case OP_CRMINQUERY:
330          cc++;
331          break;
332    
333          case OP_CRRANGE:
334          case OP_CRMINRANGE:
335          branchlength += GET2(cc,1);
336          cc += 5;
337          break;
338    
339          default:
340          branchlength++;
341          break;
342          }
343        break;
344    
345        /* Backreferences and subroutine calls are treated in the same way: we find
346        the minimum length for the subpattern. A recursion, however, causes
347        a flag to be set that causes the length of this branch to be ignored. The
348        logic is that a recursion can only make sense if there is another
349        alternation that stops the recursing. That will provide the minimum length
350        (when no recursion happens). A backreference within the group that it is
351        referencing behaves in the same way.
352    
353        If PCRE_JAVASCRIPT_COMPAT is set, a backreference to an unset bracket
354        matches an empty string (by default it causes a matching failure), so in
355        that case we must set the minimum length to zero. */
356    
357        case OP_REF:
358        case OP_REFI:
359        if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
360          {
361          ce = cs = (uschar *)_pcre_find_bracket(startcode, utf8, GET2(cc, 1));
362          if (cs == NULL) return -2;
363          do ce += GET(ce, 1); while (*ce == OP_ALT);
364          if (cc > cs && cc < ce)
365            {
366            d = 0;
367            had_recurse = TRUE;
368            }
369          else d = find_minlength(cs, startcode, options);
370          }
371        else d = 0;
372        cc += 3;
373    
374        /* Handle repeated back references */
375    
376        switch (*cc)
377          {
378          case OP_CRSTAR:
379          case OP_CRMINSTAR:
380          case OP_CRQUERY:
381          case OP_CRMINQUERY:
382          min = 0;
383          cc++;
384          break;
385    
386          case OP_CRPLUS:
387          case OP_CRMINPLUS:
388          min = 1;
389          cc++;
390          break;
391    
392          case OP_CRRANGE:
393          case OP_CRMINRANGE:
394          min = GET2(cc, 1);
395          cc += 5;
396          break;
397    
398          default:
399          min = 1;
400          break;
401          }
402    
403        branchlength += min * d;
404        break;
405    
406        case OP_RECURSE:
407        cs = ce = (uschar *)startcode + GET(cc, 1);
408        if (cs == NULL) return -2;
409        do ce += GET(ce, 1); while (*ce == OP_ALT);
410        if (cc > cs && cc < ce)
411          had_recurse = TRUE;
412        else
413          branchlength += find_minlength(cs, startcode, options);
414        cc += 1 + LINK_SIZE;
415        break;
416    
417        /* Anything else does not or need not match a character. We can get the
418        item's length from the table, but for those that can match zero occurrences
419        of a character, we must take special action for UTF-8 characters. As it
420        happens, the "NOT" versions of these opcodes are used at present only for
421        ASCII characters, so they could be omitted from this list. However, in
422        future that may change, so we include them here so as not to leave a
423        gotcha for a future maintainer. */
424    
425        case OP_UPTO:
426        case OP_UPTOI:
427        case OP_NOTUPTO:
428        case OP_NOTUPTOI:
429        case OP_MINUPTO:
430        case OP_MINUPTOI:
431        case OP_NOTMINUPTO:
432        case OP_NOTMINUPTOI:
433        case OP_POSUPTO:
434        case OP_POSUPTOI:
435        case OP_NOTPOSUPTO:
436        case OP_NOTPOSUPTOI:
437    
438        case OP_STAR:
439        case OP_STARI:
440        case OP_NOTSTAR:
441        case OP_NOTSTARI:
442        case OP_MINSTAR:
443        case OP_MINSTARI:
444        case OP_NOTMINSTAR:
445        case OP_NOTMINSTARI:
446        case OP_POSSTAR:
447        case OP_POSSTARI:
448        case OP_NOTPOSSTAR:
449        case OP_NOTPOSSTARI:
450    
451        case OP_QUERY:
452        case OP_QUERYI:
453        case OP_NOTQUERY:
454        case OP_NOTQUERYI:
455        case OP_MINQUERY:
456        case OP_MINQUERYI:
457        case OP_NOTMINQUERY:
458        case OP_NOTMINQUERYI:
459        case OP_POSQUERY:
460        case OP_POSQUERYI:
461        case OP_NOTPOSQUERY:
462        case OP_NOTPOSQUERYI:
463    
464        cc += _pcre_OP_lengths[op];
465    #ifdef SUPPORT_UTF8
466        if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
467    #endif
468        break;
469    
470        /* Skip these, but we need to add in the name length. */
471    
472        case OP_MARK:
473        case OP_PRUNE_ARG:
474        case OP_SKIP_ARG:
475        cc += _pcre_OP_lengths[op] + cc[1];
476        break;
477    
478        case OP_THEN_ARG:
479        cc += _pcre_OP_lengths[op] + cc[1+LINK_SIZE];
480        break;
481    
482        /* The remaining opcodes are just skipped over. */
483    
484        case OP_ACCEPT:
485        case OP_CLOSE:
486        case OP_COMMIT:
487        case OP_FAIL:
488        case OP_PRUNE:
489        case OP_SET_SOM:
490        case OP_SKIP:
491        case OP_THEN:
492        cc += _pcre_OP_lengths[op];
493        break;
494    
495        /* This should not occur: we list all opcodes explicitly so that when
496        new ones get added they are properly considered. */
497    
498        default:
499        return -3;
500        }
501      }
502    /* Control never gets here */
503    }
504    
505    
506    
507  /*************************************************  /*************************************************
508  *      Set a bit and maybe its alternate case    *  *      Set a bit and maybe its alternate case    *
509  *************************************************/  *************************************************/
510    
511  /* Given a character, set its bit in the table, and also the bit for the other  /* Given a character, set its first byte's bit in the table, and also the
512  version of a letter if we are caseless.  corresponding bit for the other version of a letter if we are caseless. In
513    UTF-8 mode, for characters greater than 127, we can only do the caseless thing
514    when Unicode property support is available.
515    
516  Arguments:  Arguments:
517    start_bits    points to the bit map    start_bits    points to the bit map
518    c             is the character    p             points to the character
519    caseless      the caseless flag    caseless      the caseless flag
520    cd            the block with char table pointers    cd            the block with char table pointers
521      utf8          TRUE for UTF-8 mode
522    
523  Returns:        nothing  Returns:        pointer after the character
524    */
525    
526    static const uschar *
527    set_table_bit(uschar *start_bits, const uschar *p, BOOL caseless,
528      compile_data *cd, BOOL utf8)
529    {
530    unsigned int c = *p;
531    
532    SET_BIT(c);
533    
534    #ifdef SUPPORT_UTF8
535    if (utf8 && c > 127)
536      {
537      GETCHARINC(c, p);
538    #ifdef SUPPORT_UCP
539      if (caseless)
540        {
541        uschar buff[8];
542        c = UCD_OTHERCASE(c);
543        (void)_pcre_ord2utf8(c, buff);
544        SET_BIT(buff[0]);
545        }
546    #endif
547      return p;
548      }
549    #endif
550    
551    /* Not UTF-8 mode, or character is less than 128. */
552    
553    if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
554    return p + 1;
555    }
556    
557    
558    
559    /*************************************************
560    *     Set bits for a positive character type     *
561    *************************************************/
562    
563    /* This function sets starting bits for a character type. In UTF-8 mode, we can
564    only do a direct setting for bytes less than 128, as otherwise there can be
565    confusion with bytes in the middle of UTF-8 characters. In a "traditional"
566    environment, the tables will only recognize ASCII characters anyway, but in at
567    least one Windows environment, bits for some higher bytes were set in the tables.
568    So we deal with that case by considering the UTF-8 encoding.
569    
570    Arguments:
571      start_bits     the starting bitmap
572      cbit_type      the type of character wanted
573      table_limit    32 for non-UTF-8; 16 for UTF-8
574      cd             the block with char table pointers
575    
576    Returns:         nothing
577  */  */
578    
579  static void  static void
580  set_bit(uschar *start_bits, unsigned int c, BOOL caseless, compile_data *cd)  set_type_bits(uschar *start_bits, int cbit_type, int table_limit,
581      compile_data *cd)
582  {  {
583  start_bits[c/8] |= (1 << (c&7));  register int c;
584  if (caseless && (cd->ctypes[c] & ctype_letter) != 0)  for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];
585    start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7));  if (table_limit == 32) return;
586    for (c = 128; c < 256; c++)
587      {
588      if ((cd->cbits[c/8] & (1 << (c&7))) != 0)
589        {
590        uschar buff[8];
591        (void)_pcre_ord2utf8(c, buff);
592        SET_BIT(buff[0]);
593        }
594      }
595    }
596    
597    
598    /*************************************************
599    *     Set bits for a negative character type     *
600    *************************************************/
601    
602    /* This function sets starting bits for a negative character type such as \D.
603    In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
604    otherwise there can be confusion with bytes in the middle of UTF-8 characters.
605    Unlike in the positive case, where we can set appropriate starting bits for
606    specific high-valued UTF-8 characters, in this case we have to set the bits for
607    all high-valued characters. The lowest is 0xc2, but we overkill by starting at
608    0xc0 (192) for simplicity.
609    
610    Arguments:
611      start_bits     the starting bitmap
612      cbit_type      the type of character wanted
613      table_limit    32 for non-UTF-8; 16 for UTF-8
614      cd             the block with char table pointers
615    
616    Returns:         nothing
617    */
618    
619    static void
620    set_nottype_bits(uschar *start_bits, int cbit_type, int table_limit,
621      compile_data *cd)
622    {
623    register int c;
624    for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type];
625    if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;
626  }  }
627    
628    
# Line 91  function fails unless the result is SSB_ Line 642  function fails unless the result is SSB_
642  Arguments:  Arguments:
643    code         points to an expression    code         points to an expression
644    start_bits   points to a 32-byte table, initialized to 0    start_bits   points to a 32-byte table, initialized to 0
   caseless     the current state of the caseless flag  
645    utf8         TRUE if in UTF-8 mode    utf8         TRUE if in UTF-8 mode
646    cd           the block with char table pointers    cd           the block with char table pointers
647    
648  Returns:       SSB_FAIL     => Failed to find any starting bytes  Returns:       SSB_FAIL     => Failed to find any starting bytes
649                 SSB_DONE     => Found mandatory starting bytes                 SSB_DONE     => Found mandatory starting bytes
650                 SSB_CONTINUE => Found optional starting bytes                 SSB_CONTINUE => Found optional starting bytes
651                   SSB_UNKNOWN  => Hit an unrecognized opcode
652  */  */
653    
654  static int  static int
655  set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,  set_start_bits(const uschar *code, uschar *start_bits, BOOL utf8,
656    BOOL utf8, compile_data *cd)    compile_data *cd)
657  {  {
658  register int c;  register int c;
659  int yield = SSB_DONE;  int yield = SSB_DONE;
660    int table_limit = utf8? 16:32;
661    
662  #if 0  #if 0
663  /* ========================================================================= */  /* ========================================================================= */
# Line 126  volatile int dummy; Line 678  volatile int dummy;
678    
679  do  do
680    {    {
   const uschar *tcode = code + (((int)*code == OP_CBRA)? 3:1) + LINK_SIZE;  
681    BOOL try_next = TRUE;    BOOL try_next = TRUE;
682      const uschar *tcode = code + 1 + LINK_SIZE;
683    
684      if (*code == OP_CBRA || *code == OP_SCBRA ||
685          *code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += 2;
686    
687    while (try_next)    /* Loop for items in this branch */    while (try_next)    /* Loop for items in this branch */
688      {      {
689      int rc;      int rc;
690      switch(*tcode)      switch(*tcode)
691        {        {
692        /* Fail if we reach something we don't understand */        /* If we reach something we don't understand, it means a new opcode has
693          been created that hasn't been added to this code. Hopefully this problem
694          will be discovered during testing. */
695    
696        default:        default:
697          return SSB_UNKNOWN;
698    
699          /* Fail for a valid opcode that implies no starting bits. */
700    
701          case OP_ACCEPT:
702          case OP_ALLANY:
703          case OP_ANY:
704          case OP_ANYBYTE:
705          case OP_CIRC:
706          case OP_CIRCM:
707          case OP_CLOSE:
708          case OP_COMMIT:
709          case OP_COND:
710          case OP_CREF:
711          case OP_DEF:
712          case OP_DOLL:
713          case OP_DOLLM:
714          case OP_END:
715          case OP_EOD:
716          case OP_EODN:
717          case OP_EXTUNI:
718          case OP_FAIL:
719          case OP_MARK:
720          case OP_NCREF:
721          case OP_NOT:
722          case OP_NOTEXACT:
723          case OP_NOTEXACTI:
724          case OP_NOTI:
725          case OP_NOTMINPLUS:
726          case OP_NOTMINPLUSI:
727          case OP_NOTMINQUERY:
728          case OP_NOTMINQUERYI:
729          case OP_NOTMINSTAR:
730          case OP_NOTMINSTARI:
731          case OP_NOTMINUPTO:
732          case OP_NOTMINUPTOI:
733          case OP_NOTPLUS:
734          case OP_NOTPLUSI:
735          case OP_NOTPOSPLUS:
736          case OP_NOTPOSPLUSI:
737          case OP_NOTPOSQUERY:
738          case OP_NOTPOSQUERYI:
739          case OP_NOTPOSSTAR:
740          case OP_NOTPOSSTARI:
741          case OP_NOTPOSUPTO:
742          case OP_NOTPOSUPTOI:
743          case OP_NOTPROP:
744          case OP_NOTQUERY:
745          case OP_NOTQUERYI:
746          case OP_NOTSTAR:
747          case OP_NOTSTARI:
748          case OP_NOTUPTO:
749          case OP_NOTUPTOI:
750          case OP_NOT_HSPACE:
751          case OP_NOT_VSPACE:
752          case OP_NOT_WORD_BOUNDARY:
753          case OP_NRREF:
754          case OP_PROP:
755          case OP_PRUNE:
756          case OP_PRUNE_ARG:
757          case OP_RECURSE:
758          case OP_REF:
759          case OP_REFI:
760          case OP_REVERSE:
761          case OP_RREF:
762          case OP_SCOND:
763          case OP_SET_SOM:
764          case OP_SKIP:
765          case OP_SKIP_ARG:
766          case OP_SOD:
767          case OP_SOM:
768          case OP_THEN:
769          case OP_THEN_ARG:
770          case OP_WORD_BOUNDARY:
771          case OP_XCLASS:
772        return SSB_FAIL;        return SSB_FAIL;
773    
774        /* If we hit a bracket or a positive lookahead assertion, recurse to set        /* If we hit a bracket or a positive lookahead assertion, recurse to set
# Line 148  do Line 780  do
780        case OP_SBRA:        case OP_SBRA:
781        case OP_CBRA:        case OP_CBRA:
782        case OP_SCBRA:        case OP_SCBRA:
783          case OP_BRAPOS:
784          case OP_SBRAPOS:
785          case OP_CBRAPOS:
786          case OP_SCBRAPOS:
787        case OP_ONCE:        case OP_ONCE:
788        case OP_ASSERT:        case OP_ASSERT:
789        rc = set_start_bits(tcode, start_bits, caseless, utf8, cd);        rc = set_start_bits(tcode, start_bits, utf8, cd);
790        if (rc == SSB_FAIL) return SSB_FAIL;        if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
791        if (rc == SSB_DONE) try_next = FALSE; else        if (rc == SSB_DONE) try_next = FALSE; else
792          {          {
793          do tcode += GET(tcode, 1); while (*tcode == OP_ALT);          do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
# Line 174  do Line 810  do
810        case OP_KET:        case OP_KET:
811        case OP_KETRMAX:        case OP_KETRMAX:
812        case OP_KETRMIN:        case OP_KETRMIN:
813          case OP_KETRPOS:
814        return SSB_CONTINUE;        return SSB_CONTINUE;
815    
816        /* Skip over callout */        /* Skip over callout */
# Line 191  do Line 828  do
828        tcode += 1 + LINK_SIZE;        tcode += 1 + LINK_SIZE;
829        break;        break;
830    
       /* Skip over an option setting, changing the caseless flag */  
   
       case OP_OPT:  
       caseless = (tcode[1] & PCRE_CASELESS) != 0;  
       tcode += 2;  
       break;  
   
831        /* BRAZERO does the bracket, but carries on. */        /* BRAZERO does the bracket, but carries on. */
832    
833        case OP_BRAZERO:        case OP_BRAZERO:
834        case OP_BRAMINZERO:        case OP_BRAMINZERO:
835        if (set_start_bits(++tcode, start_bits, caseless, utf8, cd) == SSB_FAIL)        case OP_BRAPOSZERO:
836          return SSB_FAIL;        rc = set_start_bits(++tcode, start_bits, utf8, cd);
837          if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
838  /* =========================================================================  /* =========================================================================
839        See the comment at the head of this function concerning the next line,        See the comment at the head of this function concerning the next line,
840        which was an old fudge for the benefit of OS/2.        which was an old fudge for the benefit of OS/2.
# Line 213  do Line 844  do
844        tcode += 1 + LINK_SIZE;        tcode += 1 + LINK_SIZE;
845        break;        break;
846    
847          /* SKIPZERO skips the bracket. */
848    
849          case OP_SKIPZERO:
850          tcode++;
851          do tcode += GET(tcode,1); while (*tcode == OP_ALT);
852          tcode += 1 + LINK_SIZE;
853          break;
854    
855        /* Single-char * or ? sets the bit and tries the next item */        /* Single-char * or ? sets the bit and tries the next item */
856    
857        case OP_STAR:        case OP_STAR:
# Line 221  do Line 860  do
860        case OP_QUERY:        case OP_QUERY:
861        case OP_MINQUERY:        case OP_MINQUERY:
862        case OP_POSQUERY:        case OP_POSQUERY:
863        set_bit(start_bits, tcode[1], caseless, cd);        tcode = set_table_bit(start_bits, tcode + 1, FALSE, cd, utf8);
864        tcode += 2;        break;
865  #ifdef SUPPORT_UTF8  
866        if (utf8 && tcode[-1] >= 0xc0)        case OP_STARI:
867          tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];        case OP_MINSTARI:
868  #endif        case OP_POSSTARI:
869          case OP_QUERYI:
870          case OP_MINQUERYI:
871          case OP_POSQUERYI:
872          tcode = set_table_bit(start_bits, tcode + 1, TRUE, cd, utf8);
873        break;        break;
874    
875        /* Single-char upto sets the bit and tries the next */        /* Single-char upto sets the bit and tries the next */
# Line 234  do Line 877  do
877        case OP_UPTO:        case OP_UPTO:
878        case OP_MINUPTO:        case OP_MINUPTO:
879        case OP_POSUPTO:        case OP_POSUPTO:
880        set_bit(start_bits, tcode[3], caseless, cd);        tcode = set_table_bit(start_bits, tcode + 3, FALSE, cd, utf8);
881        tcode += 4;        break;
882  #ifdef SUPPORT_UTF8  
883        if (utf8 && tcode[-1] >= 0xc0)        case OP_UPTOI:
884          tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];        case OP_MINUPTOI:
885  #endif        case OP_POSUPTOI:
886          tcode = set_table_bit(start_bits, tcode + 3, TRUE, cd, utf8);
887        break;        break;
888    
889        /* At least one single char sets the bit and stops */        /* At least one single char sets the bit and stops */
890    
891        case OP_EXACT:       /* Fall through */        case OP_EXACT:
892        tcode += 2;        tcode += 2;
893          /* Fall through */
894        case OP_CHAR:        case OP_CHAR:
       case OP_CHARNC:  
895        case OP_PLUS:        case OP_PLUS:
896        case OP_MINPLUS:        case OP_MINPLUS:
897        case OP_POSPLUS:        case OP_POSPLUS:
898        set_bit(start_bits, tcode[1], caseless, cd);        (void)set_table_bit(start_bits, tcode + 1, FALSE, cd, utf8);
899        try_next = FALSE;        try_next = FALSE;
900        break;        break;
901    
902        /* Single character type sets the bits and stops */        case OP_EXACTI:
903          tcode += 2;
904          /* Fall through */
905          case OP_CHARI:
906          case OP_PLUSI:
907          case OP_MINPLUSI:
908          case OP_POSPLUSI:
909          (void)set_table_bit(start_bits, tcode + 1, TRUE, cd, utf8);
910          try_next = FALSE;
911          break;
912    
913          /* Special spacing and line-terminating items. These recognize specific
914          lists of characters. The difference between VSPACE and ANYNL is that the
915          latter can match the two-character CRLF sequence, but that is not
916          relevant for finding the first character, so their code here is
917          identical. */
918    
919          case OP_HSPACE:
920          SET_BIT(0x09);
921          SET_BIT(0x20);
922          if (utf8)
923            {
924            SET_BIT(0xC2);  /* For U+00A0 */
925            SET_BIT(0xE1);  /* For U+1680, U+180E */
926            SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
927            SET_BIT(0xE3);  /* For U+3000 */
928            }
929          else SET_BIT(0xA0);
930          try_next = FALSE;
931          break;
932    
933          case OP_ANYNL:
934          case OP_VSPACE:
935          SET_BIT(0x0A);
936          SET_BIT(0x0B);
937          SET_BIT(0x0C);
938          SET_BIT(0x0D);
939          if (utf8)
940            {
941            SET_BIT(0xC2);  /* For U+0085 */
942            SET_BIT(0xE2);  /* For U+2028, U+2029 */
943            }
944          else SET_BIT(0x85);
945          try_next = FALSE;
946          break;
947    
948          /* Single character types set the bits and stop. Note that if PCRE_UCP
949          is set, we do not see these op codes because \d etc are converted to
950          properties. Therefore, these apply in the case when only characters less
951          than 256 are recognized to match the types. */
952    
953        case OP_NOT_DIGIT:        case OP_NOT_DIGIT:
954        for (c = 0; c < 32; c++)        set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
         start_bits[c] |= ~cd->cbits[c+cbit_digit];  
955        try_next = FALSE;        try_next = FALSE;
956        break;        break;
957    
958        case OP_DIGIT:        case OP_DIGIT:
959        for (c = 0; c < 32; c++)        set_type_bits(start_bits, cbit_digit, table_limit, cd);
         start_bits[c] |= cd->cbits[c+cbit_digit];  
960        try_next = FALSE;        try_next = FALSE;
961        break;        break;
962    
963        /* The cbit_space table has vertical tab as whitespace; we have to        /* The cbit_space table has vertical tab as whitespace; we have to
964        discard it. */        ensure it is set as not whitespace. */
965    
966        case OP_NOT_WHITESPACE:        case OP_NOT_WHITESPACE:
967        for (c = 0; c < 32; c++)        set_nottype_bits(start_bits, cbit_space, table_limit, cd);
968          {        start_bits[1] |= 0x08;
         int d = cd->cbits[c+cbit_space];  
         if (c == 1) d &= ~0x08;  
         start_bits[c] |= ~d;  
         }  
969        try_next = FALSE;        try_next = FALSE;
970        break;        break;
971    
972        /* The cbit_space table has vertical tab as whitespace; we have to        /* The cbit_space table has vertical tab as whitespace; we have to
973        discard it. */        not set it from the table. */
974    
975        case OP_WHITESPACE:        case OP_WHITESPACE:
976        for (c = 0; c < 32; c++)        c = start_bits[1];    /* Save in case it was already set */
977          {        set_type_bits(start_bits, cbit_space, table_limit, cd);
978          int d = cd->cbits[c+cbit_space];        start_bits[1] = (start_bits[1] & ~0x08) | c;
         if (c == 1) d &= ~0x08;  
         start_bits[c] |= d;  
         }  
979        try_next = FALSE;        try_next = FALSE;
980        break;        break;
981    
982        case OP_NOT_WORDCHAR:        case OP_NOT_WORDCHAR:
983        for (c = 0; c < 32; c++)        set_nottype_bits(start_bits, cbit_word, table_limit, cd);
         start_bits[c] |= ~cd->cbits[c+cbit_word];  
984        try_next = FALSE;        try_next = FALSE;
985        break;        break;
986    
987        case OP_WORDCHAR:        case OP_WORDCHAR:
988        for (c = 0; c < 32; c++)        set_type_bits(start_bits, cbit_word, table_limit, cd);
         start_bits[c] |= cd->cbits[c+cbit_word];  
989        try_next = FALSE;        try_next = FALSE;
990        break;        break;
991    
# Line 313  do Line 994  do
994    
995        case OP_TYPEPLUS:        case OP_TYPEPLUS:
996        case OP_TYPEMINPLUS:        case OP_TYPEMINPLUS:
997          case OP_TYPEPOSPLUS:
998        tcode++;        tcode++;
999        break;        break;
1000    
# Line 336  do Line 1018  do
1018        case OP_TYPEPOSQUERY:        case OP_TYPEPOSQUERY:
1019        switch(tcode[1])        switch(tcode[1])
1020          {          {
1021            default:
1022          case OP_ANY:          case OP_ANY:
1023            case OP_ALLANY:
1024          return SSB_FAIL;          return SSB_FAIL;
1025    
1026            case OP_HSPACE:
1027            SET_BIT(0x09);
1028            SET_BIT(0x20);
1029            if (utf8)
1030              {
1031              SET_BIT(0xC2);  /* For U+00A0 */
1032              SET_BIT(0xE1);  /* For U+1680, U+180E */
1033              SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
1034              SET_BIT(0xE3);  /* For U+3000 */
1035              }
1036            else SET_BIT(0xA0);
1037            break;
1038    
1039            case OP_ANYNL:
1040            case OP_VSPACE:
1041            SET_BIT(0x0A);
1042            SET_BIT(0x0B);
1043            SET_BIT(0x0C);
1044            SET_BIT(0x0D);
1045            if (utf8)
1046              {
1047              SET_BIT(0xC2);  /* For U+0085 */
1048              SET_BIT(0xE2);  /* For U+2028, U+2029 */
1049              }
1050            else SET_BIT(0x85);
1051            break;
1052    
1053          case OP_NOT_DIGIT:          case OP_NOT_DIGIT:
1054          for (c = 0; c < 32; c++)          set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
           start_bits[c] |= ~cd->cbits[c+cbit_digit];  
1055          break;          break;
1056    
1057          case OP_DIGIT:          case OP_DIGIT:
1058          for (c = 0; c < 32; c++)          set_type_bits(start_bits, cbit_digit, table_limit, cd);
           start_bits[c] |= cd->cbits[c+cbit_digit];  
1059          break;          break;
1060    
1061          /* The cbit_space table has vertical tab as whitespace; we have to          /* The cbit_space table has vertical tab as whitespace; we have to
1062          discard it. */          ensure it gets set as not whitespace. */
1063    
1064          case OP_NOT_WHITESPACE:          case OP_NOT_WHITESPACE:
1065          for (c = 0; c < 32; c++)          set_nottype_bits(start_bits, cbit_space, table_limit, cd);
1066            {          start_bits[1] |= 0x08;
           int d = cd->cbits[c+cbit_space];  
           if (c == 1) d &= ~0x08;  
           start_bits[c] |= ~d;  
           }  
1067          break;          break;
1068    
1069          /* The cbit_space table has vertical tab as whitespace; we have to          /* The cbit_space table has vertical tab as whitespace; we have to
1070          discard it. */          avoid setting it. */
1071    
1072          case OP_WHITESPACE:          case OP_WHITESPACE:
1073          for (c = 0; c < 32; c++)          c = start_bits[1];    /* Save in case it was already set */
1074            {          set_type_bits(start_bits, cbit_space, table_limit, cd);
1075            int d = cd->cbits[c+cbit_space];          start_bits[1] = (start_bits[1] & ~0x08) | c;
           if (c == 1) d &= ~0x08;  
           start_bits[c] |= d;  
           }  
1076          break;          break;
1077    
1078          case OP_NOT_WORDCHAR:          case OP_NOT_WORDCHAR:
1079          for (c = 0; c < 32; c++)          set_nottype_bits(start_bits, cbit_word, table_limit, cd);
           start_bits[c] |= ~cd->cbits[c+cbit_word];  
1080          break;          break;
1081    
1082          case OP_WORDCHAR:          case OP_WORDCHAR:
1083          for (c = 0; c < 32; c++)          set_type_bits(start_bits, cbit_word, table_limit, cd);
           start_bits[c] |= cd->cbits[c+cbit_word];  
1084          break;          break;
1085          }          }
1086    
# Line 394  do Line 1094  do
1094        character with a value > 255. */        character with a value > 255. */
1095    
1096        case OP_NCLASS:        case OP_NCLASS:
1097    #ifdef SUPPORT_UTF8
1098        if (utf8)        if (utf8)
1099          {          {
1100          start_bits[24] |= 0xf0;              /* Bits for 0xc4 - 0xc8 */          start_bits[24] |= 0xf0;              /* Bits for 0xc4 - 0xc8 */
1101          memset(start_bits+25, 0xff, 7);      /* Bits for 0xc9 - 0xff */          memset(start_bits+25, 0xff, 7);      /* Bits for 0xc9 - 0xff */
1102          }          }
1103    #endif
1104        /* Fall through */        /* Fall through */
1105    
1106        case OP_CLASS:        case OP_CLASS:
# Line 411  do Line 1113  do
1113          value is > 127. In fact, there are only two possible starting bytes for          value is > 127. In fact, there are only two possible starting bytes for
1114          characters in the range 128 - 255. */          characters in the range 128 - 255. */
1115    
1116    #ifdef SUPPORT_UTF8
1117          if (utf8)          if (utf8)
1118            {            {
1119            for (c = 0; c < 16; c++) start_bits[c] |= tcode[c];            for (c = 0; c < 16; c++) start_bits[c] |= tcode[c];
# Line 428  do Line 1131  do
1131          /* In non-UTF-8 mode, the two bit maps are completely compatible. */          /* In non-UTF-8 mode, the two bit maps are completely compatible. */
1132    
1133          else          else
1134    #endif
1135            {            {
1136            for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];            for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
1137            }            }
1138    
1139          /* Advance past the bit map, and act on what follows */          /* Advance past the bit map, and act on what follows. For a zero
1140            minimum repeat, continue; otherwise stop processing. */
1141    
1142          tcode += 32;          tcode += 32;
1143          switch (*tcode)          switch (*tcode)
# Line 449  do Line 1154  do
1154            if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5;            if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5;
1155              else try_next = FALSE;              else try_next = FALSE;
1156            break;            break;
1157    
1158            default:            default:
1159            try_next = FALSE;            try_next = FALSE;
1160            break;            break;
# Line 468  return yield; Line 1173  return yield;
1173    
1174    
1175    
1176    
1177    
1178  /*************************************************  /*************************************************
1179  *          Study a compiled expression           *  *          Study a compiled expression           *
1180  *************************************************/  *************************************************/
# Line 483  Arguments: Line 1190  Arguments:
1190              set NULL unless error              set NULL unless error
1191    
1192  Returns:    pointer to a pcre_extra block, with study_data filled in and the  Returns:    pointer to a pcre_extra block, with study_data filled in and the
1193                appropriate flag set;                appropriate flags set;
1194              NULL on error or if no optimization possible              NULL on error or if no optimization possible
1195  */  */
1196    
1197  PCRE_DATA_SCOPE pcre_extra *  PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION
1198  pcre_study(const pcre *external_re, int options, const char **errorptr)  pcre_study(const pcre *external_re, int options, const char **errorptr)
1199  {  {
1200    int min;
1201    BOOL bits_set = FALSE;
1202  uschar start_bits[32];  uschar start_bits[32];
1203  pcre_extra *extra;  pcre_extra *extra;
1204  pcre_study_data *study;  pcre_study_data *study;
# Line 516  code = (uschar *)re + re->name_table_off Line 1225  code = (uschar *)re + re->name_table_off
1225    (re->name_count * re->name_entry_size);    (re->name_count * re->name_entry_size);
1226    
1227  /* For an anchored pattern, or an unanchored pattern that has a first char, or  /* For an anchored pattern, or an unanchored pattern that has a first char, or
1228  a multiline pattern that matches only at "line starts", no further processing  a multiline pattern that matches only at "line starts", there is no point in
1229  at present. */  seeking a list of starting bytes. */
1230    
1231  if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)  if ((re->options & PCRE_ANCHORED) == 0 &&
1232    return NULL;      (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) == 0)
1233      {
1234      int rc;
1235    
1236      /* Set the character tables in the block that is passed around */
1237    
1238  /* Set the character tables in the block that is passed around */    tables = re->tables;
1239      if (tables == NULL)
1240        (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1241        (void *)(&tables));
1242    
1243      compile_block.lcc = tables + lcc_offset;
1244      compile_block.fcc = tables + fcc_offset;
1245      compile_block.cbits = tables + cbits_offset;
1246      compile_block.ctypes = tables + ctypes_offset;
1247    
1248      /* See if we can find a fixed set of initial characters for the pattern. */
1249    
1250      memset(start_bits, 0, 32 * sizeof(uschar));
1251      rc = set_start_bits(code, start_bits, (re->options & PCRE_UTF8) != 0,
1252        &compile_block);
1253      bits_set = rc == SSB_DONE;
1254      if (rc == SSB_UNKNOWN) *errorptr = "internal error: opcode not recognized";
1255      }
1256    
1257    /* Find the minimum length of subject string. */
1258    
1259    switch(min = find_minlength(code, code, re->options))
1260      {
1261      case -2: *errorptr = "internal error: missing capturing bracket"; break;
1262      case -3: *errorptr = "internal error: opcode not recognized"; break;
1263      default: break;
1264      }
1265    
1266    /* Return NULL if there's been an error or if no optimization is possible. */
1267    
1268  tables = re->tables;  if (*errorptr != NULL || (!bits_set && min < 0)) return NULL;
 if (tables == NULL)  
   (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,  
   (void *)(&tables));  
   
 compile_block.lcc = tables + lcc_offset;  
 compile_block.fcc = tables + fcc_offset;  
 compile_block.cbits = tables + cbits_offset;  
 compile_block.ctypes = tables + ctypes_offset;  
   
 /* See if we can find a fixed set of initial characters for the pattern. */  
   
 memset(start_bits, 0, 32 * sizeof(uschar));  
 if (set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0,  
   (re->options & PCRE_UTF8) != 0, &compile_block) != SSB_DONE) return NULL;  
1269    
1270  /* Get a pcre_extra block and a pcre_study_data block. The study data is put in  /* Get a pcre_extra block and a pcre_study_data block. The study data is put in
1271  the latter, which is pointed to by the former, which may also get additional  the latter, which is pointed to by the former, which may also get additional
# Line 561  extra->flags = PCRE_EXTRA_STUDY_DATA; Line 1288  extra->flags = PCRE_EXTRA_STUDY_DATA;
1288  extra->study_data = study;  extra->study_data = study;
1289    
1290  study->size = sizeof(pcre_study_data);  study->size = sizeof(pcre_study_data);
1291  study->options = PCRE_STUDY_MAPPED;  study->flags = 0;
1292  memcpy(study->start_bits, start_bits, sizeof(start_bits));  
1293    if (bits_set)
1294      {
1295      study->flags |= PCRE_STUDY_MAPPED;
1296      memcpy(study->start_bits, start_bits, sizeof(start_bits));
1297      }
1298    
1299    if (min >= 0)
1300      {
1301      study->flags |= PCRE_STUDY_MINLEN;
1302      study->minlength = min;
1303      }
1304    
1305  return extra;  return extra;
1306  }  }

Legend:
Removed from v.96  
changed lines
  Added in v.605

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12