/[pcre]/code/trunk/pcre_study.c
ViewVC logotype

Diff of /code/trunk/pcre_study.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 523 by ph10, Sun May 23 18:48:54 2010 UTC revision 613 by ph10, Sat Jul 2 16:59:52 2011 UTC
# Line 48  supporting functions. */ Line 48  supporting functions. */
48    
49  #include "pcre_internal.h"  #include "pcre_internal.h"
50    
51    #define SET_BIT(c) start_bits[c/8] |= (1 << (c&7))
52    
53  /* Returns from set_start_bits() */  /* Returns from set_start_bits() */
54    
55  enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE };  enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN };
56    
57    
58    
# Line 65  string of that length that matches. In U Line 66  string of that length that matches. In U
66  rather than bytes.  rather than bytes.
67    
68  Arguments:  Arguments:
69    code       pointer to start of group (the bracket)    code        pointer to start of group (the bracket)
70    startcode  pointer to start of the whole pattern    startcode   pointer to start of the whole pattern
71    options    the compiling options    options     the compiling options
72      had_accept  pointer to flag for (*ACCEPT) encountered
73    
74  Returns:   the minimum length  Returns:   the minimum length
75             -1 if \C was encountered             -1 if \C was encountered
76             -2 internal error (missing capturing bracket)             -2 internal error (missing capturing bracket)
77               -3 internal error (opcode not listed)
78  */  */
79    
80  static int  static int
81  find_minlength(const uschar *code, const uschar *startcode, int options)  find_minlength(const uschar *code, const uschar *startcode, int options,
82      BOOL *had_accept_ptr)
83  {  {
84  int length = -1;  int length = -1;
85  BOOL utf8 = (options & PCRE_UTF8) != 0;  BOOL utf8 = (options & PCRE_UTF8) != 0;
# Line 83  BOOL had_recurse = FALSE; Line 87  BOOL had_recurse = FALSE;
87  register int branchlength = 0;  register int branchlength = 0;
88  register uschar *cc = (uschar *)code + 1 + LINK_SIZE;  register uschar *cc = (uschar *)code + 1 + LINK_SIZE;
89    
90  if (*code == OP_CBRA || *code == OP_SCBRA) cc += 2;  if (*code == OP_CBRA || *code == OP_SCBRA ||
91        *code == OP_CBRAPOS || *code == OP_SCBRAPOS) cc += 2;
92    
93  /* Scan along the opcodes for this branch. If we get to the end of the  /* Scan along the opcodes for this branch. If we get to the end of the
94  branch, check the length against that of the other branches. */  branch, check the length against that of the other branches. */
# Line 117  for (;;) Line 122  for (;;)
122      case OP_SCBRA:      case OP_SCBRA:
123      case OP_BRA:      case OP_BRA:
124      case OP_SBRA:      case OP_SBRA:
125        case OP_CBRAPOS:
126        case OP_SCBRAPOS:
127        case OP_BRAPOS:
128        case OP_SBRAPOS:
129      case OP_ONCE:      case OP_ONCE:
130      d = find_minlength(cc, startcode, options);      d = find_minlength(cc, startcode, options, had_accept_ptr);
131      if (d < 0) return d;      if (d < 0) return d;
132      branchlength += d;      branchlength += d;
133        if (*had_accept_ptr) return branchlength;
134      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
135      cc += 1 + LINK_SIZE;      cc += 1 + LINK_SIZE;
136      break;      break;
137    
138      /* Reached end of a branch; if it's a ket it is the end of a nested      /* Reached end of a branch; if it's a ket it is the end of a nested
139      call. If it's ALT it is an alternation in a nested call. If it is      call. If it's ALT it is an alternation in a nested call. If it is END it's
140      END it's the end of the outer call. All can be handled by the same code. */      the end of the outer call. All can be handled by the same code. If it is
141        ACCEPT, it is essentially the same as END, but we set a flag so that
142        counting stops. */
143    
144        case OP_ACCEPT:
145        case OP_ASSERT_ACCEPT:
146        *had_accept_ptr = TRUE;
147        /* Fall through */
148      case OP_ALT:      case OP_ALT:
149      case OP_KET:      case OP_KET:
150      case OP_KETRMAX:      case OP_KETRMAX:
151      case OP_KETRMIN:      case OP_KETRMIN:
152        case OP_KETRPOS:
153      case OP_END:      case OP_END:
154      if (length < 0 || (!had_recurse && branchlength < length))      if (length < 0 || (!had_recurse && branchlength < length))
155        length = branchlength;        length = branchlength;
156      if (*cc != OP_ALT) return length;      if (op != OP_ALT) return length;
157      cc += 1 + LINK_SIZE;      cc += 1 + LINK_SIZE;
158      branchlength = 0;      branchlength = 0;
159      had_recurse = FALSE;      had_recurse = FALSE;
# Line 159  for (;;) Line 176  for (;;)
176      case OP_RREF:      case OP_RREF:
177      case OP_NRREF:      case OP_NRREF:
178      case OP_DEF:      case OP_DEF:
     case OP_OPT:  
179      case OP_CALLOUT:      case OP_CALLOUT:
180      case OP_SOD:      case OP_SOD:
181      case OP_SOM:      case OP_SOM:
182      case OP_EOD:      case OP_EOD:
183      case OP_EODN:      case OP_EODN:
184      case OP_CIRC:      case OP_CIRC:
185        case OP_CIRCM:
186      case OP_DOLL:      case OP_DOLL:
187        case OP_DOLLM:
188      case OP_NOT_WORD_BOUNDARY:      case OP_NOT_WORD_BOUNDARY:
189      case OP_WORD_BOUNDARY:      case OP_WORD_BOUNDARY:
190      cc += _pcre_OP_lengths[*cc];      cc += _pcre_OP_lengths[*cc];
# Line 176  for (;;) Line 194  for (;;)
194    
195      case OP_BRAZERO:      case OP_BRAZERO:
196      case OP_BRAMINZERO:      case OP_BRAMINZERO:
197        case OP_BRAPOSZERO:
198      case OP_SKIPZERO:      case OP_SKIPZERO:
199      cc += _pcre_OP_lengths[*cc];      cc += _pcre_OP_lengths[*cc];
200      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
# Line 185  for (;;) Line 204  for (;;)
204      /* Handle literal characters and + repetitions */      /* Handle literal characters and + repetitions */
205    
206      case OP_CHAR:      case OP_CHAR:
207      case OP_CHARNC:      case OP_CHARI:
208      case OP_NOT:      case OP_NOT:
209        case OP_NOTI:
210      case OP_PLUS:      case OP_PLUS:
211        case OP_PLUSI:
212      case OP_MINPLUS:      case OP_MINPLUS:
213        case OP_MINPLUSI:
214      case OP_POSPLUS:      case OP_POSPLUS:
215        case OP_POSPLUSI:
216      case OP_NOTPLUS:      case OP_NOTPLUS:
217        case OP_NOTPLUSI:
218      case OP_NOTMINPLUS:      case OP_NOTMINPLUS:
219        case OP_NOTMINPLUSI:
220      case OP_NOTPOSPLUS:      case OP_NOTPOSPLUS:
221        case OP_NOTPOSPLUSI:
222      branchlength++;      branchlength++;
223      cc += 2;      cc += 2;
224  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
# Line 211  for (;;) Line 237  for (;;)
237      need to skip over a multibyte character in UTF8 mode.  */      need to skip over a multibyte character in UTF8 mode.  */
238    
239      case OP_EXACT:      case OP_EXACT:
240        case OP_EXACTI:
241      case OP_NOTEXACT:      case OP_NOTEXACT:
242        case OP_NOTEXACTI:
243      branchlength += GET2(cc,1);      branchlength += GET2(cc,1);
244      cc += 4;      cc += 4;
245  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
# Line 248  for (;;) Line 276  for (;;)
276      cc++;      cc++;
277      break;      break;
278    
279      /* "Any newline" might match two characters */      /* "Any newline" might match two characters, but it also might match just
280        one. */
281    
282      case OP_ANYNL:      case OP_ANYNL:
283      branchlength += 2;      branchlength += 1;
284      cc++;      cc++;
285      break;      break;
286    
# Line 336  for (;;) Line 365  for (;;)
365      that case we must set the minimum length to zero. */      that case we must set the minimum length to zero. */
366    
367      case OP_REF:      case OP_REF:
368        case OP_REFI:
369      if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)      if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
370        {        {
371        ce = cs = (uschar *)_pcre_find_bracket(startcode, utf8, GET2(cc, 1));        ce = cs = (uschar *)_pcre_find_bracket(startcode, utf8, GET2(cc, 1));
# Line 346  for (;;) Line 376  for (;;)
376          d = 0;          d = 0;
377          had_recurse = TRUE;          had_recurse = TRUE;
378          }          }
379        else d = find_minlength(cs, startcode, options);        else
380            {
381            d = find_minlength(cs, startcode, options, had_accept_ptr);
382            *had_accept_ptr = FALSE;
383            }
384        }        }
385      else d = 0;      else d = 0;
386      cc += 3;      cc += 3;
# Line 363  for (;;) Line 397  for (;;)
397        cc++;        cc++;
398        break;        break;
399    
400          case OP_CRPLUS:
401          case OP_CRMINPLUS:
402          min = 1;
403          cc++;
404          break;
405    
406        case OP_CRRANGE:        case OP_CRRANGE:
407        case OP_CRMINRANGE:        case OP_CRMINRANGE:
408        min = GET2(cc, 1);        min = GET2(cc, 1);
# Line 384  for (;;) Line 424  for (;;)
424      if (cc > cs && cc < ce)      if (cc > cs && cc < ce)
425        had_recurse = TRUE;        had_recurse = TRUE;
426      else      else
427        branchlength += find_minlength(cs, startcode, options);        {
428          branchlength += find_minlength(cs, startcode, options, had_accept_ptr);
429          *had_accept_ptr = FALSE;
430          }
431      cc += 1 + LINK_SIZE;      cc += 1 + LINK_SIZE;
432      break;      break;
433    
434      /* Anything else does not or need not match a character. We can get the      /* Anything else does not or need not match a character. We can get the
435      item's length from the table, but for those that can match zero occurrences      item's length from the table, but for those that can match zero occurrences
436      of a character, we must take special action for UTF-8 characters. */      of a character, we must take special action for UTF-8 characters. As it
437        happens, the "NOT" versions of these opcodes are used at present only for
438        ASCII characters, so they could be omitted from this list. However, in
439        future that may change, so we include them here so as not to leave a
440        gotcha for a future maintainer. */
441    
442      case OP_UPTO:      case OP_UPTO:
443        case OP_UPTOI:
444      case OP_NOTUPTO:      case OP_NOTUPTO:
445        case OP_NOTUPTOI:
446      case OP_MINUPTO:      case OP_MINUPTO:
447        case OP_MINUPTOI:
448      case OP_NOTMINUPTO:      case OP_NOTMINUPTO:
449        case OP_NOTMINUPTOI:
450      case OP_POSUPTO:      case OP_POSUPTO:
451        case OP_POSUPTOI:
452        case OP_NOTPOSUPTO:
453        case OP_NOTPOSUPTOI:
454    
455      case OP_STAR:      case OP_STAR:
456        case OP_STARI:
457        case OP_NOTSTAR:
458        case OP_NOTSTARI:
459      case OP_MINSTAR:      case OP_MINSTAR:
460        case OP_MINSTARI:
461      case OP_NOTMINSTAR:      case OP_NOTMINSTAR:
462        case OP_NOTMINSTARI:
463      case OP_POSSTAR:      case OP_POSSTAR:
464        case OP_POSSTARI:
465      case OP_NOTPOSSTAR:      case OP_NOTPOSSTAR:
466        case OP_NOTPOSSTARI:
467    
468      case OP_QUERY:      case OP_QUERY:
469        case OP_QUERYI:
470        case OP_NOTQUERY:
471        case OP_NOTQUERYI:
472      case OP_MINQUERY:      case OP_MINQUERY:
473        case OP_MINQUERYI:
474      case OP_NOTMINQUERY:      case OP_NOTMINQUERY:
475        case OP_NOTMINQUERYI:
476      case OP_POSQUERY:      case OP_POSQUERY:
477        case OP_POSQUERYI:
478      case OP_NOTPOSQUERY:      case OP_NOTPOSQUERY:
479        case OP_NOTPOSQUERYI:
480    
481      cc += _pcre_OP_lengths[op];      cc += _pcre_OP_lengths[op];
482  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
483      if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];      if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
# Line 418  for (;;) Line 489  for (;;)
489      case OP_MARK:      case OP_MARK:
490      case OP_PRUNE_ARG:      case OP_PRUNE_ARG:
491      case OP_SKIP_ARG:      case OP_SKIP_ARG:
     case OP_THEN_ARG:  
492      cc += _pcre_OP_lengths[op] + cc[1];      cc += _pcre_OP_lengths[op] + cc[1];
493      break;      break;
494    
495      /* For the record, these are the opcodes that are matched by "default":      case OP_THEN_ARG:
496      OP_ACCEPT, OP_CLOSE, OP_COMMIT, OP_FAIL, OP_PRUNE, OP_SET_SOM, OP_SKIP,      cc += _pcre_OP_lengths[op] + cc[1+LINK_SIZE];
497      OP_THEN. */      break;
498    
499        /* The remaining opcodes are just skipped over. */
500    
501      default:      case OP_CLOSE:
502        case OP_COMMIT:
503        case OP_FAIL:
504        case OP_PRUNE:
505        case OP_SET_SOM:
506        case OP_SKIP:
507        case OP_THEN:
508      cc += _pcre_OP_lengths[op];      cc += _pcre_OP_lengths[op];
509      break;      break;
510    
511        /* This should not occur: we list all opcodes explicitly so that when
512        new ones get added they are properly considered. */
513    
514        default:
515        return -3;
516      }      }
517    }    }
518  /* Control never gets here */  /* Control never gets here */
# Line 440  for (;;) Line 524  for (;;)
524  *      Set a bit and maybe its alternate case    *  *      Set a bit and maybe its alternate case    *
525  *************************************************/  *************************************************/
526    
527  /* Given a character, set its first byte's bit in the table, and also the  /* Given a character, set its first byte's bit in the table, and also the
528  corresponding bit for the other version of a letter if we are caseless. In  corresponding bit for the other version of a letter if we are caseless. In
529  UTF-8 mode, for characters greater than 127, we can only do the caseless thing  UTF-8 mode, for characters greater than 127, we can only do the caseless thing
530  when Unicode property support is available.  when Unicode property support is available.
# Line 450  Arguments: Line 534  Arguments:
534    p             points to the character    p             points to the character
535    caseless      the caseless flag    caseless      the caseless flag
536    cd            the block with char table pointers    cd            the block with char table pointers
537    utf8          TRUE for UTF-8 mode    utf8          TRUE for UTF-8 mode
538    
539  Returns:        pointer after the character  Returns:        pointer after the character
540  */  */
# Line 460  set_table_bit(uschar *start_bits, const Line 544  set_table_bit(uschar *start_bits, const
544    compile_data *cd, BOOL utf8)    compile_data *cd, BOOL utf8)
545  {  {
546  unsigned int c = *p;  unsigned int c = *p;
547  start_bits[c/8] |= (1 << (c&7));  
548    SET_BIT(c);
549    
550  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
551  if (utf8 && c > 127)  if (utf8 && c > 127)
# Line 469  if (utf8 && c > 127) Line 554  if (utf8 && c > 127)
554  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
555    if (caseless)    if (caseless)
556      {      {
557      uschar buff[8];      uschar buff[8];
558      c = UCD_OTHERCASE(c);      c = UCD_OTHERCASE(c);
559      (void)_pcre_ord2utf8(c, buff);      (void)_pcre_ord2utf8(c, buff);
560      c = buff[0];      SET_BIT(buff[0]);
561      start_bits[c/8] |= (1 << (c&7));      }
562      }  #endif
 #endif  
563    return p;    return p;
564    }    }
565  #endif  #endif
566    
567  /* Not UTF-8 mode, or character is less than 127. */  /* Not UTF-8 mode, or character is less than 127. */
568    
569  if (caseless && (cd->ctypes[c] & ctype_letter) != 0)  if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
   start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7));  
570  return p + 1;  return p + 1;
571  }  }
572    
573    
574    
575  /*************************************************  /*************************************************
576    *     Set bits for a positive character type     *
577    *************************************************/
578    
579    /* This function sets starting bits for a character type. In UTF-8 mode, we can
580    only do a direct setting for bytes less than 128, as otherwise there can be
581    confusion with bytes in the middle of UTF-8 characters. In a "traditional"
582    environment, the tables will only recognize ASCII characters anyway, but in at
583    least one Windows environment, some higher bytes bits were set in the tables.
584    So we deal with that case by considering the UTF-8 encoding.
585    
586    Arguments:
587      start_bits     the starting bitmap
588      cbit type      the type of character wanted
589      table_limit    32 for non-UTF-8; 16 for UTF-8
590      cd             the block with char table pointers
591    
592    Returns:         nothing
593    */
594    
595    static void
596    set_type_bits(uschar *start_bits, int cbit_type, int table_limit,
597      compile_data *cd)
598    {
599    register int c;
600    for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];
601    if (table_limit == 32) return;
602    for (c = 128; c < 256; c++)
603      {
604      if ((cd->cbits[c/8] & (1 << (c&7))) != 0)
605        {
606        uschar buff[8];
607        (void)_pcre_ord2utf8(c, buff);
608        SET_BIT(buff[0]);
609        }
610      }
611    }
612    
613    
614    /*************************************************
615    *     Set bits for a negative character type     *
616    *************************************************/
617    
618    /* This function sets starting bits for a negative character type such as \D.
619    In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
620    otherwise there can be confusion with bytes in the middle of UTF-8 characters.
621    Unlike in the positive case, where we can set appropriate starting bits for
622    specific high-valued UTF-8 characters, in this case we have to set the bits for
623    all high-valued characters. The lowest is 0xc2, but we overkill by starting at
624    0xc0 (192) for simplicity.
625    
626    Arguments:
627      start_bits     the starting bitmap
628      cbit type      the type of character wanted
629      table_limit    32 for non-UTF-8; 16 for UTF-8
630      cd             the block with char table pointers
631    
632    Returns:         nothing
633    */
634    
635    static void
636    set_nottype_bits(uschar *start_bits, int cbit_type, int table_limit,
637      compile_data *cd)
638    {
639    register int c;
640    for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type];
641    if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;
642    }
643    
644    
645    
646    /*************************************************
647  *          Create bitmap of starting bytes       *  *          Create bitmap of starting bytes       *
648  *************************************************/  *************************************************/
649    
# Line 504  function fails unless the result is SSB_ Line 658  function fails unless the result is SSB_
658  Arguments:  Arguments:
659    code         points to an expression    code         points to an expression
660    start_bits   points to a 32-byte table, initialized to 0    start_bits   points to a 32-byte table, initialized to 0
   caseless     the current state of the caseless flag  
661    utf8         TRUE if in UTF-8 mode    utf8         TRUE if in UTF-8 mode
662    cd           the block with char table pointers    cd           the block with char table pointers
663    
664  Returns:       SSB_FAIL     => Failed to find any starting bytes  Returns:       SSB_FAIL     => Failed to find any starting bytes
665                 SSB_DONE     => Found mandatory starting bytes                 SSB_DONE     => Found mandatory starting bytes
666                 SSB_CONTINUE => Found optional starting bytes                 SSB_CONTINUE => Found optional starting bytes
667                   SSB_UNKNOWN  => Hit an unrecognized opcode
668  */  */
669    
670  static int  static int
671  set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,  set_start_bits(const uschar *code, uschar *start_bits, BOOL utf8,
672    BOOL utf8, compile_data *cd)    compile_data *cd)
673  {  {
674  register int c;  register int c;
675  int yield = SSB_DONE;  int yield = SSB_DONE;
676    int table_limit = utf8? 16:32;
677    
678  #if 0  #if 0
679  /* ========================================================================= */  /* ========================================================================= */
# Line 539  volatile int dummy; Line 694  volatile int dummy;
694    
695  do  do
696    {    {
   const uschar *tcode = code + (((int)*code == OP_CBRA)? 3:1) + LINK_SIZE;  
697    BOOL try_next = TRUE;    BOOL try_next = TRUE;
698      const uschar *tcode = code + 1 + LINK_SIZE;
699    
700      if (*code == OP_CBRA || *code == OP_SCBRA ||
701          *code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += 2;
702    
703    while (try_next)    /* Loop for items in this branch */    while (try_next)    /* Loop for items in this branch */
704      {      {
705      int rc;      int rc;
706    
707      switch(*tcode)      switch(*tcode)
708        {        {
709        /* Fail if we reach something we don't understand */        /* If we reach something we don't understand, it means a new opcode has
710          been created that hasn't been added to this code. Hopefully this problem
711          will be discovered during testing. */
712    
713        default:        default:
714          return SSB_UNKNOWN;
715    
716          /* Fail for a valid opcode that implies no starting bits. */
717    
718          case OP_ACCEPT:
719          case OP_ASSERT_ACCEPT:
720          case OP_ALLANY:
721          case OP_ANY:
722          case OP_ANYBYTE:
723          case OP_CIRC:
724          case OP_CIRCM:
725          case OP_CLOSE:
726          case OP_COMMIT:
727          case OP_COND:
728          case OP_CREF:
729          case OP_DEF:
730          case OP_DOLL:
731          case OP_DOLLM:
732          case OP_END:
733          case OP_EOD:
734          case OP_EODN:
735          case OP_EXTUNI:
736          case OP_FAIL:
737          case OP_MARK:
738          case OP_NCREF:
739          case OP_NOT:
740          case OP_NOTEXACT:
741          case OP_NOTEXACTI:
742          case OP_NOTI:
743          case OP_NOTMINPLUS:
744          case OP_NOTMINPLUSI:
745          case OP_NOTMINQUERY:
746          case OP_NOTMINQUERYI:
747          case OP_NOTMINSTAR:
748          case OP_NOTMINSTARI:
749          case OP_NOTMINUPTO:
750          case OP_NOTMINUPTOI:
751          case OP_NOTPLUS:
752          case OP_NOTPLUSI:
753          case OP_NOTPOSPLUS:
754          case OP_NOTPOSPLUSI:
755          case OP_NOTPOSQUERY:
756          case OP_NOTPOSQUERYI:
757          case OP_NOTPOSSTAR:
758          case OP_NOTPOSSTARI:
759          case OP_NOTPOSUPTO:
760          case OP_NOTPOSUPTOI:
761          case OP_NOTPROP:
762          case OP_NOTQUERY:
763          case OP_NOTQUERYI:
764          case OP_NOTSTAR:
765          case OP_NOTSTARI:
766          case OP_NOTUPTO:
767          case OP_NOTUPTOI:
768          case OP_NOT_HSPACE:
769          case OP_NOT_VSPACE:
770          case OP_NOT_WORD_BOUNDARY:
771          case OP_NRREF:
772          case OP_PROP:
773          case OP_PRUNE:
774          case OP_PRUNE_ARG:
775          case OP_RECURSE:
776          case OP_REF:
777          case OP_REFI:
778          case OP_REVERSE:
779          case OP_RREF:
780          case OP_SCOND:
781          case OP_SET_SOM:
782          case OP_SKIP:
783          case OP_SKIP_ARG:
784          case OP_SOD:
785          case OP_SOM:
786          case OP_THEN:
787          case OP_THEN_ARG:
788          case OP_WORD_BOUNDARY:
789          case OP_XCLASS:
790        return SSB_FAIL;        return SSB_FAIL;
791    
792        /* If we hit a bracket or a positive lookahead assertion, recurse to set        /* If we hit a bracket or a positive lookahead assertion, recurse to set
# Line 561  do Line 798  do
798        case OP_SBRA:        case OP_SBRA:
799        case OP_CBRA:        case OP_CBRA:
800        case OP_SCBRA:        case OP_SCBRA:
801          case OP_BRAPOS:
802          case OP_SBRAPOS:
803          case OP_CBRAPOS:
804          case OP_SCBRAPOS:
805        case OP_ONCE:        case OP_ONCE:
806        case OP_ASSERT:        case OP_ASSERT:
807        rc = set_start_bits(tcode, start_bits, caseless, utf8, cd);        rc = set_start_bits(tcode, start_bits, utf8, cd);
808        if (rc == SSB_FAIL) return SSB_FAIL;        if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
809        if (rc == SSB_DONE) try_next = FALSE; else        if (rc == SSB_DONE) try_next = FALSE; else
810          {          {
811          do tcode += GET(tcode, 1); while (*tcode == OP_ALT);          do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
# Line 587  do Line 828  do
828        case OP_KET:        case OP_KET:
829        case OP_KETRMAX:        case OP_KETRMAX:
830        case OP_KETRMIN:        case OP_KETRMIN:
831          case OP_KETRPOS:
832        return SSB_CONTINUE;        return SSB_CONTINUE;
833    
834        /* Skip over callout */        /* Skip over callout */
# Line 604  do Line 846  do
846        tcode += 1 + LINK_SIZE;        tcode += 1 + LINK_SIZE;
847        break;        break;
848    
       /* Skip over an option setting, changing the caseless flag */  
   
       case OP_OPT:  
       caseless = (tcode[1] & PCRE_CASELESS) != 0;  
       tcode += 2;  
       break;  
   
849        /* BRAZERO does the bracket, but carries on. */        /* BRAZERO does the bracket, but carries on. */
850    
851        case OP_BRAZERO:        case OP_BRAZERO:
852        case OP_BRAMINZERO:        case OP_BRAMINZERO:
853        if (set_start_bits(++tcode, start_bits, caseless, utf8, cd) == SSB_FAIL)        case OP_BRAPOSZERO:
854          return SSB_FAIL;        rc = set_start_bits(++tcode, start_bits, utf8, cd);
855          if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
856  /* =========================================================================  /* =========================================================================
857        See the comment at the head of this function concerning the next line,        See the comment at the head of this function concerning the next line,
858        which was an old fudge for the benefit of OS/2.        which was an old fudge for the benefit of OS/2.
# Line 642  do Line 878  do
878        case OP_QUERY:        case OP_QUERY:
879        case OP_MINQUERY:        case OP_MINQUERY:
880        case OP_POSQUERY:        case OP_POSQUERY:
881        tcode = set_table_bit(start_bits, tcode + 1, caseless, cd, utf8);        tcode = set_table_bit(start_bits, tcode + 1, FALSE, cd, utf8);
882          break;
883    
884          case OP_STARI:
885          case OP_MINSTARI:
886          case OP_POSSTARI:
887          case OP_QUERYI:
888          case OP_MINQUERYI:
889          case OP_POSQUERYI:
890          tcode = set_table_bit(start_bits, tcode + 1, TRUE, cd, utf8);
891        break;        break;
892    
893        /* Single-char upto sets the bit and tries the next */        /* Single-char upto sets the bit and tries the next */
# Line 650  do Line 895  do
895        case OP_UPTO:        case OP_UPTO:
896        case OP_MINUPTO:        case OP_MINUPTO:
897        case OP_POSUPTO:        case OP_POSUPTO:
898        tcode = set_table_bit(start_bits, tcode + 3, caseless, cd, utf8);        tcode = set_table_bit(start_bits, tcode + 3, FALSE, cd, utf8);
899          break;
900    
901          case OP_UPTOI:
902          case OP_MINUPTOI:
903          case OP_POSUPTOI:
904          tcode = set_table_bit(start_bits, tcode + 3, TRUE, cd, utf8);
905        break;        break;
906    
907        /* At least one single char sets the bit and stops */        /* At least one single char sets the bit and stops */
908    
909        case OP_EXACT:       /* Fall through */        case OP_EXACT:
910        tcode += 2;        tcode += 2;
911          /* Fall through */
912        case OP_CHAR:        case OP_CHAR:
       case OP_CHARNC:  
913        case OP_PLUS:        case OP_PLUS:
914        case OP_MINPLUS:        case OP_MINPLUS:
915        case OP_POSPLUS:        case OP_POSPLUS:
916        (void)set_table_bit(start_bits, tcode + 1, caseless, cd, utf8);        (void)set_table_bit(start_bits, tcode + 1, FALSE, cd, utf8);
917          try_next = FALSE;
918          break;
919    
920          case OP_EXACTI:
921          tcode += 2;
922          /* Fall through */
923          case OP_CHARI:
924          case OP_PLUSI:
925          case OP_MINPLUSI:
926          case OP_POSPLUSI:
927          (void)set_table_bit(start_bits, tcode + 1, TRUE, cd, utf8);
928          try_next = FALSE;
929          break;
930    
931          /* Special spacing and line-terminating items. These recognize specific
932          lists of characters. The difference between VSPACE and ANYNL is that the
933          latter can match the two-character CRLF sequence, but that is not
934          relevant for finding the first character, so their code here is
935          identical. */
936    
937          case OP_HSPACE:
938          SET_BIT(0x09);
939          SET_BIT(0x20);
940          if (utf8)
941            {
942            SET_BIT(0xC2);  /* For U+00A0 */
943            SET_BIT(0xE1);  /* For U+1680, U+180E */
944            SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
945            SET_BIT(0xE3);  /* For U+3000 */
946            }
947          else SET_BIT(0xA0);
948          try_next = FALSE;
949          break;
950    
951          case OP_ANYNL:
952          case OP_VSPACE:
953          SET_BIT(0x0A);
954          SET_BIT(0x0B);
955          SET_BIT(0x0C);
956          SET_BIT(0x0D);
957          if (utf8)
958            {
959            SET_BIT(0xC2);  /* For U+0085 */
960            SET_BIT(0xE2);  /* For U+2028, U+2029 */
961            }
962          else SET_BIT(0x85);
963        try_next = FALSE;        try_next = FALSE;
964        break;        break;
965    
966        /* Single character types set the bits and stop. Note that if PCRE_UCP        /* Single character types set the bits and stop. Note that if PCRE_UCP
967        is set, we do not see these op codes because \d etc are converted to        is set, we do not see these op codes because \d etc are converted to
968        properties. Therefore, these apply in the case when only ASCII characters        properties. Therefore, these apply in the case when only characters less
969        are recognized to match the types. */        than 256 are recognized to match the types. */
970    
971        case OP_NOT_DIGIT:        case OP_NOT_DIGIT:
972        for (c = 0; c < 32; c++)        set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
         start_bits[c] |= ~cd->cbits[c+cbit_digit];  
973        try_next = FALSE;        try_next = FALSE;
974        break;        break;
975    
976        case OP_DIGIT:        case OP_DIGIT:
977        for (c = 0; c < 32; c++)        set_type_bits(start_bits, cbit_digit, table_limit, cd);
         start_bits[c] |= cd->cbits[c+cbit_digit];  
978        try_next = FALSE;        try_next = FALSE;
979        break;        break;
980    
981        /* The cbit_space table has vertical tab as whitespace; we have to        /* The cbit_space table has vertical tab as whitespace; we have to
982        discard it. */        ensure it is set as not whitespace. */
983    
984        case OP_NOT_WHITESPACE:        case OP_NOT_WHITESPACE:
985        for (c = 0; c < 32; c++)        set_nottype_bits(start_bits, cbit_space, table_limit, cd);
986          {        start_bits[1] |= 0x08;
         int d = cd->cbits[c+cbit_space];  
         if (c == 1) d &= ~0x08;  
         start_bits[c] |= ~d;  
         }  
987        try_next = FALSE;        try_next = FALSE;
988        break;        break;
989    
990        /* The cbit_space table has vertical tab as whitespace; we have to        /* The cbit_space table has vertical tab as whitespace; we have to
991        discard it. */        not set it from the table. */
992    
993        case OP_WHITESPACE:        case OP_WHITESPACE:
994        for (c = 0; c < 32; c++)        c = start_bits[1];    /* Save in case it was already set */
995          {        set_type_bits(start_bits, cbit_space, table_limit, cd);
996          int d = cd->cbits[c+cbit_space];        start_bits[1] = (start_bits[1] & ~0x08) | c;
         if (c == 1) d &= ~0x08;  
         start_bits[c] |= d;  
         }  
997        try_next = FALSE;        try_next = FALSE;
998        break;        break;
999    
1000        case OP_NOT_WORDCHAR:        case OP_NOT_WORDCHAR:
1001        for (c = 0; c < 32; c++)        set_nottype_bits(start_bits, cbit_word, table_limit, cd);
         start_bits[c] |= ~cd->cbits[c+cbit_word];  
1002        try_next = FALSE;        try_next = FALSE;
1003        break;        break;
1004    
1005        case OP_WORDCHAR:        case OP_WORDCHAR:
1006        for (c = 0; c < 32; c++)        set_type_bits(start_bits, cbit_word, table_limit, cd);
         start_bits[c] |= cd->cbits[c+cbit_word];  
1007        try_next = FALSE;        try_next = FALSE;
1008        break;        break;
1009    
# Line 727  do Line 1012  do
1012    
1013        case OP_TYPEPLUS:        case OP_TYPEPLUS:
1014        case OP_TYPEMINPLUS:        case OP_TYPEMINPLUS:
1015          case OP_TYPEPOSPLUS:
1016        tcode++;        tcode++;
1017        break;        break;
1018    
# Line 755  do Line 1041  do
1041          case OP_ALLANY:          case OP_ALLANY:
1042          return SSB_FAIL;          return SSB_FAIL;
1043    
1044            case OP_HSPACE:
1045            SET_BIT(0x09);
1046            SET_BIT(0x20);
1047            if (utf8)
1048              {
1049              SET_BIT(0xC2);  /* For U+00A0 */
1050              SET_BIT(0xE1);  /* For U+1680, U+180E */
1051              SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
1052              SET_BIT(0xE3);  /* For U+3000 */
1053              }
1054            else SET_BIT(0xA0);
1055            break;
1056    
1057            case OP_ANYNL:
1058            case OP_VSPACE:
1059            SET_BIT(0x0A);
1060            SET_BIT(0x0B);
1061            SET_BIT(0x0C);
1062            SET_BIT(0x0D);
1063            if (utf8)
1064              {
1065              SET_BIT(0xC2);  /* For U+0085 */
1066              SET_BIT(0xE2);  /* For U+2028, U+2029 */
1067              }
1068            else SET_BIT(0x85);
1069            break;
1070    
1071          case OP_NOT_DIGIT:          case OP_NOT_DIGIT:
1072          for (c = 0; c < 32; c++)          set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
           start_bits[c] |= ~cd->cbits[c+cbit_digit];  
1073          break;          break;
1074    
1075          case OP_DIGIT:          case OP_DIGIT:
1076          for (c = 0; c < 32; c++)          set_type_bits(start_bits, cbit_digit, table_limit, cd);
           start_bits[c] |= cd->cbits[c+cbit_digit];  
1077          break;          break;
1078    
1079          /* The cbit_space table has vertical tab as whitespace; we have to          /* The cbit_space table has vertical tab as whitespace; we have to
1080          discard it. */          ensure it gets set as not whitespace. */
1081    
1082          case OP_NOT_WHITESPACE:          case OP_NOT_WHITESPACE:
1083          for (c = 0; c < 32; c++)          set_nottype_bits(start_bits, cbit_space, table_limit, cd);
1084            {          start_bits[1] |= 0x08;
           int d = cd->cbits[c+cbit_space];  
           if (c == 1) d &= ~0x08;  
           start_bits[c] |= ~d;  
           }  
1085          break;          break;
1086    
1087          /* The cbit_space table has vertical tab as whitespace; we have to          /* The cbit_space table has vertical tab as whitespace; we have to
1088          discard it. */          avoid setting it. */
1089    
1090          case OP_WHITESPACE:          case OP_WHITESPACE:
1091          for (c = 0; c < 32; c++)          c = start_bits[1];    /* Save in case it was already set */
1092            {          set_type_bits(start_bits, cbit_space, table_limit, cd);
1093            int d = cd->cbits[c+cbit_space];          start_bits[1] = (start_bits[1] & ~0x08) | c;
           if (c == 1) d &= ~0x08;  
           start_bits[c] |= d;  
           }  
1094          break;          break;
1095    
1096          case OP_NOT_WORDCHAR:          case OP_NOT_WORDCHAR:
1097          for (c = 0; c < 32; c++)          set_nottype_bits(start_bits, cbit_word, table_limit, cd);
           start_bits[c] |= ~cd->cbits[c+cbit_word];  
1098          break;          break;
1099    
1100          case OP_WORDCHAR:          case OP_WORDCHAR:
1101          for (c = 0; c < 32; c++)          set_type_bits(start_bits, cbit_word, table_limit, cd);
           start_bits[c] |= cd->cbits[c+cbit_word];  
1102          break;          break;
1103          }          }
1104    
# Line 852  do Line 1154  do
1154            for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];            for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
1155            }            }
1156    
1157          /* Advance past the bit map, and act on what follows */          /* Advance past the bit map, and act on what follows. For a zero
1158            minimum repeat, continue; otherwise stop processing. */
1159    
1160          tcode += 32;          tcode += 32;
1161          switch (*tcode)          switch (*tcode)
# Line 869  do Line 1172  do
1172            if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5;            if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5;
1173              else try_next = FALSE;              else try_next = FALSE;
1174            break;            break;
1175    
1176            default:            default:
1177            try_next = FALSE;            try_next = FALSE;
1178            break;            break;
# Line 888  return yield; Line 1191  return yield;
1191    
1192    
1193    
1194    
1195    
1196  /*************************************************  /*************************************************
1197  *          Study a compiled expression           *  *          Study a compiled expression           *
1198  *************************************************/  *************************************************/
# Line 912  pcre_study(const pcre *external_re, int Line 1217  pcre_study(const pcre *external_re, int
1217  {  {
1218  int min;  int min;
1219  BOOL bits_set = FALSE;  BOOL bits_set = FALSE;
1220    BOOL had_accept = FALSE;
1221  uschar start_bits[32];  uschar start_bits[32];
1222  pcre_extra *extra;  pcre_extra *extra;
1223  pcre_study_data *study;  pcre_study_data *study;
# Line 944  seeking a list of starting bytes. */ Line 1250  seeking a list of starting bytes. */
1250  if ((re->options & PCRE_ANCHORED) == 0 &&  if ((re->options & PCRE_ANCHORED) == 0 &&
1251      (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) == 0)      (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) == 0)
1252    {    {
1253      int rc;
1254    
1255    /* Set the character tables in the block that is passed around */    /* Set the character tables in the block that is passed around */
1256    
1257    tables = re->tables;    tables = re->tables;
# Line 959  if ((re->options & PCRE_ANCHORED) == 0 & Line 1267  if ((re->options & PCRE_ANCHORED) == 0 &
1267    /* See if we can find a fixed set of initial characters for the pattern. */    /* See if we can find a fixed set of initial characters for the pattern. */
1268    
1269    memset(start_bits, 0, 32 * sizeof(uschar));    memset(start_bits, 0, 32 * sizeof(uschar));
1270    bits_set = set_start_bits(code, start_bits,    rc = set_start_bits(code, start_bits, (re->options & PCRE_UTF8) != 0,
1271      (re->options & PCRE_CASELESS) != 0, (re->options & PCRE_UTF8) != 0,      &compile_block);
1272      &compile_block) == SSB_DONE;    bits_set = rc == SSB_DONE;
1273      if (rc == SSB_UNKNOWN) *errorptr = "internal error: opcode not recognized";
1274    }    }
1275    
1276  /* Find the minimum length of subject string. */  /* Find the minimum length of subject string. */
1277    
1278  min = find_minlength(code, code, re->options);  switch(min = find_minlength(code, code, re->options, &had_accept))
1279      {
1280      case -2: *errorptr = "internal error: missing capturing bracket"; break;
1281      case -3: *errorptr = "internal error: opcode not recognized"; break;
1282      default: break;
1283      }
1284    
1285  /* Return NULL if no optimization is possible. */  /* Return NULL if there's been an error or if no optimization is possible. */
1286    
1287  if (!bits_set && min < 0) return NULL;  if (*errorptr != NULL || (!bits_set && min < 0)) return NULL;
1288    
1289  /* Get a pcre_extra block and a pcre_study_data block. The study data is put in  /* Get a pcre_extra block and a pcre_study_data block. The study data is put in
1290  the latter, which is pointed to by the former, which may also get additional  the latter, which is pointed to by the former, which may also get additional

Legend:
Removed from v.523  
changed lines
  Added in v.613

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12