/[pcre]/code/trunk/pcre_study.c
ViewVC logotype

Diff of /code/trunk/pcre_study.c

Parent Directory | Revision Log | View Patch

revision 520 by ph10, Sat May 22 18:54:05 2010 UTC | revision 604 by ph10, Thu Jun 2 19:04:54 2011 UTC
# Line 48  supporting functions. */ Line 48  supporting functions. */
48    
49  #include "pcre_internal.h"  #include "pcre_internal.h"
50    
51    #define SET_BIT(c) start_bits[c/8] |= (1 << (c&7))
52    
53  /* Returns from set_start_bits() */  /* Returns from set_start_bits() */
54    
# Line 72  Arguments: Line 73  Arguments:
73  Returns:   the minimum length  Returns:   the minimum length
74             -1 if \C was encountered             -1 if \C was encountered
75             -2 internal error (missing capturing bracket)             -2 internal error (missing capturing bracket)
76               -3 internal error (opcode not listed)
77  */  */
78    
79  static int  static int
# Line 83  BOOL had_recurse = FALSE; Line 85  BOOL had_recurse = FALSE;
85  register int branchlength = 0;  register int branchlength = 0;
86  register uschar *cc = (uschar *)code + 1 + LINK_SIZE;  register uschar *cc = (uschar *)code + 1 + LINK_SIZE;
87    
88  if (*code == OP_CBRA || *code == OP_SCBRA) cc += 2;  if (*code == OP_CBRA || *code == OP_SCBRA ||
89        *code == OP_CBRAPOS || *code == OP_SCBRAPOS) cc += 2;
90    
91  /* Scan along the opcodes for this branch. If we get to the end of the  /* Scan along the opcodes for this branch. If we get to the end of the
92  branch, check the length against that of the other branches. */  branch, check the length against that of the other branches. */
# Line 117  for (;;) Line 120  for (;;)
120      case OP_SCBRA:      case OP_SCBRA:
121      case OP_BRA:      case OP_BRA:
122      case OP_SBRA:      case OP_SBRA:
123        case OP_CBRAPOS:
124        case OP_SCBRAPOS:
125        case OP_BRAPOS:
126        case OP_SBRAPOS:
127      case OP_ONCE:      case OP_ONCE:
128      d = find_minlength(cc, startcode, options);      d = find_minlength(cc, startcode, options);
129      if (d < 0) return d;      if (d < 0) return d;
# Line 133  for (;;) Line 140  for (;;)
140      case OP_KET:      case OP_KET:
141      case OP_KETRMAX:      case OP_KETRMAX:
142      case OP_KETRMIN:      case OP_KETRMIN:
143        case OP_KETRPOS:
144      case OP_END:      case OP_END:
145      if (length < 0 || (!had_recurse && branchlength < length))      if (length < 0 || (!had_recurse && branchlength < length))
146        length = branchlength;        length = branchlength;
# Line 159  for (;;) Line 167  for (;;)
167      case OP_RREF:      case OP_RREF:
168      case OP_NRREF:      case OP_NRREF:
169      case OP_DEF:      case OP_DEF:
     case OP_OPT:  
170      case OP_CALLOUT:      case OP_CALLOUT:
171      case OP_SOD:      case OP_SOD:
172      case OP_SOM:      case OP_SOM:
173      case OP_EOD:      case OP_EOD:
174      case OP_EODN:      case OP_EODN:
175      case OP_CIRC:      case OP_CIRC:
176        case OP_CIRCM:
177      case OP_DOLL:      case OP_DOLL:
178        case OP_DOLLM:
179      case OP_NOT_WORD_BOUNDARY:      case OP_NOT_WORD_BOUNDARY:
180      case OP_WORD_BOUNDARY:      case OP_WORD_BOUNDARY:
181      cc += _pcre_OP_lengths[*cc];      cc += _pcre_OP_lengths[*cc];
# Line 176  for (;;) Line 185  for (;;)
185    
186      case OP_BRAZERO:      case OP_BRAZERO:
187      case OP_BRAMINZERO:      case OP_BRAMINZERO:
188        case OP_BRAPOSZERO:
189      case OP_SKIPZERO:      case OP_SKIPZERO:
190      cc += _pcre_OP_lengths[*cc];      cc += _pcre_OP_lengths[*cc];
191      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
# Line 185  for (;;) Line 195  for (;;)
195      /* Handle literal characters and + repetitions */      /* Handle literal characters and + repetitions */
196    
197      case OP_CHAR:      case OP_CHAR:
198      case OP_CHARNC:      case OP_CHARI:
199      case OP_NOT:      case OP_NOT:
200        case OP_NOTI:
201      case OP_PLUS:      case OP_PLUS:
202        case OP_PLUSI:
203      case OP_MINPLUS:      case OP_MINPLUS:
204        case OP_MINPLUSI:
205      case OP_POSPLUS:      case OP_POSPLUS:
206        case OP_POSPLUSI:
207      case OP_NOTPLUS:      case OP_NOTPLUS:
208        case OP_NOTPLUSI:
209      case OP_NOTMINPLUS:      case OP_NOTMINPLUS:
210        case OP_NOTMINPLUSI:
211      case OP_NOTPOSPLUS:      case OP_NOTPOSPLUS:
212        case OP_NOTPOSPLUSI:
213      branchlength++;      branchlength++;
214      cc += 2;      cc += 2;
215  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
# Line 211  for (;;) Line 228  for (;;)
228      need to skip over a multibyte character in UTF8 mode.  */      need to skip over a multibyte character in UTF8 mode.  */
229    
230      case OP_EXACT:      case OP_EXACT:
231        case OP_EXACTI:
232      case OP_NOTEXACT:      case OP_NOTEXACT:
233        case OP_NOTEXACTI:
234      branchlength += GET2(cc,1);      branchlength += GET2(cc,1);
235      cc += 4;      cc += 4;
236  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
# Line 336  for (;;) Line 355  for (;;)
355      that case we must set the minimum length to zero. */      that case we must set the minimum length to zero. */
356    
357      case OP_REF:      case OP_REF:
358        case OP_REFI:
359      if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)      if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
360        {        {
361        ce = cs = (uschar *)_pcre_find_bracket(startcode, utf8, GET2(cc, 1));        ce = cs = (uschar *)_pcre_find_bracket(startcode, utf8, GET2(cc, 1));
# Line 362  for (;;) Line 382  for (;;)
382        min = 0;        min = 0;
383        cc++;        cc++;
384        break;        break;
385    
386          case OP_CRPLUS:
387          case OP_CRMINPLUS:
388          min = 1;
389          cc++;
390          break;
391    
392        case OP_CRRANGE:        case OP_CRRANGE:
393        case OP_CRMINRANGE:        case OP_CRMINRANGE:
394        min = GET2(cc, 1);        min = GET2(cc, 1);
# Line 390  for (;;) Line 416  for (;;)
416    
417      /* Anything else does not or need not match a character. We can get the      /* Anything else does not or need not match a character. We can get the
418      item's length from the table, but for those that can match zero occurrences      item's length from the table, but for those that can match zero occurrences
419      of a character, we must take special action for UTF-8 characters. */      of a character, we must take special action for UTF-8 characters. As it
420        happens, the "NOT" versions of these opcodes are used at present only for
421        ASCII characters, so they could be omitted from this list. However, in
422        future that may change, so we include them here so as not to leave a
423        gotcha for a future maintainer. */
424    
425      case OP_UPTO:      case OP_UPTO:
426        case OP_UPTOI:
427      case OP_NOTUPTO:      case OP_NOTUPTO:
428        case OP_NOTUPTOI:
429      case OP_MINUPTO:      case OP_MINUPTO:
430        case OP_MINUPTOI:
431      case OP_NOTMINUPTO:      case OP_NOTMINUPTO:
432        case OP_NOTMINUPTOI:
433      case OP_POSUPTO:      case OP_POSUPTO:
434        case OP_POSUPTOI:
435        case OP_NOTPOSUPTO:
436        case OP_NOTPOSUPTOI:
437    
438      case OP_STAR:      case OP_STAR:
439        case OP_STARI:
440        case OP_NOTSTAR:
441        case OP_NOTSTARI:
442      case OP_MINSTAR:      case OP_MINSTAR:
443        case OP_MINSTARI:
444      case OP_NOTMINSTAR:      case OP_NOTMINSTAR:
445        case OP_NOTMINSTARI:
446      case OP_POSSTAR:      case OP_POSSTAR:
447        case OP_POSSTARI:
448      case OP_NOTPOSSTAR:      case OP_NOTPOSSTAR:
449        case OP_NOTPOSSTARI:
450    
451      case OP_QUERY:      case OP_QUERY:
452        case OP_QUERYI:
453        case OP_NOTQUERY:
454        case OP_NOTQUERYI:
455      case OP_MINQUERY:      case OP_MINQUERY:
456        case OP_MINQUERYI:
457      case OP_NOTMINQUERY:      case OP_NOTMINQUERY:
458        case OP_NOTMINQUERYI:
459      case OP_POSQUERY:      case OP_POSQUERY:
460        case OP_POSQUERYI:
461      case OP_NOTPOSQUERY:      case OP_NOTPOSQUERY:
462        case OP_NOTPOSQUERYI:
463    
464      cc += _pcre_OP_lengths[op];      cc += _pcre_OP_lengths[op];
465  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
466      if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];      if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
# Line 418  for (;;) Line 472  for (;;)
472      case OP_MARK:      case OP_MARK:
473      case OP_PRUNE_ARG:      case OP_PRUNE_ARG:
474      case OP_SKIP_ARG:      case OP_SKIP_ARG:
     case OP_THEN_ARG:  
475      cc += _pcre_OP_lengths[op] + cc[1];      cc += _pcre_OP_lengths[op] + cc[1];
476      break;      break;
477    
478      /* For the record, these are the opcodes that are matched by "default":      case OP_THEN_ARG:
479      OP_ACCEPT, OP_CLOSE, OP_COMMIT, OP_FAIL, OP_PRUNE, OP_SET_SOM, OP_SKIP,      cc += _pcre_OP_lengths[op] + cc[1+LINK_SIZE];
480      OP_THEN. */      break;
481    
482        /* The remaining opcodes are just skipped over. */
483    
484      default:      case OP_ACCEPT:
485        case OP_CLOSE:
486        case OP_COMMIT:
487        case OP_FAIL:
488        case OP_PRUNE:
489        case OP_SET_SOM:
490        case OP_SKIP:
491        case OP_THEN:
492      cc += _pcre_OP_lengths[op];      cc += _pcre_OP_lengths[op];
493      break;      break;
494    
495        /* This should not occur: we list all opcodes explicitly so that when
496        new ones get added they are properly considered. */
497    
498        default:
499        return -3;
500      }      }
501    }    }
502  /* Control never gets here */  /* Control never gets here */
# Line 440  for (;;) Line 508  for (;;)
508  *      Set a bit and maybe its alternate case    *  *      Set a bit and maybe its alternate case    *
509  *************************************************/  *************************************************/
510    
511  /* Given a character, set its first byte's bit in the table, and also the  /* Given a character, set its first byte's bit in the table, and also the
512  corresponding bit for the other version of a letter if we are caseless. In  corresponding bit for the other version of a letter if we are caseless. In
513  UTF-8 mode, for characters greater than 127, we can only do the caseless thing  UTF-8 mode, for characters greater than 127, we can only do the caseless thing
514  when Unicode property support is available.  when Unicode property support is available.
# Line 450  Arguments: Line 518  Arguments:
518    p             points to the character    p             points to the character
519    caseless      the caseless flag    caseless      the caseless flag
520    cd            the block with char table pointers    cd            the block with char table pointers
521    utf8          TRUE for UTF-8 mode    utf8          TRUE for UTF-8 mode
522    
523  Returns:        pointer after the character  Returns:        pointer after the character
524  */  */
# Line 460  set_table_bit(uschar *start_bits, const Line 528  set_table_bit(uschar *start_bits, const
528    compile_data *cd, BOOL utf8)    compile_data *cd, BOOL utf8)
529  {  {
530  unsigned int c = *p;  unsigned int c = *p;
531  start_bits[c/8] |= (1 << (c&7));  
532    SET_BIT(c);
533    
534  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
535  if (utf8 && c > 127)  if (utf8 && c > 127)
# Line 469  if (utf8 && c > 127) Line 538  if (utf8 && c > 127)
538  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
539    if (caseless)    if (caseless)
540      {      {
541      uschar buff[8];      uschar buff[8];
542      c = UCD_OTHERCASE(c);      c = UCD_OTHERCASE(c);
543      (void)_pcre_ord2utf8(c, buff);      (void)_pcre_ord2utf8(c, buff);
544      c = buff[0];      SET_BIT(buff[0]);
545      start_bits[c/8] |= (1 << (c&7));      }
546      }  #endif
 #endif  
547    return p;    return p;
548    }    }
549  #endif  #endif
550    
551  /* Not UTF-8 mode, or character is less than 127. */  /* Not UTF-8 mode, or character is less than 127. */
552    
553  if (caseless && (cd->ctypes[c] & ctype_letter) != 0)  if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
   start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7));  
554  return p + 1;  return p + 1;
555  }  }
556    
557    
558    
559  /*************************************************  /*************************************************
560    *     Set bits for a positive character type     *
561    *************************************************/
562    
563    /* This function sets starting bits for a character type. In UTF-8 mode, we can
564    only do a direct setting for bytes less than 128, as otherwise there can be
565    confusion with bytes in the middle of UTF-8 characters. In a "traditional"
566    environment, the tables will only recognize ASCII characters anyway, but in at
567    least one Windows environment, some higher bytes bits were set in the tables.
568    So we deal with that case by considering the UTF-8 encoding.
569    
570    Arguments:
571      start_bits     the starting bitmap
572      cbit type      the type of character wanted
573      table_limit    32 for non-UTF-8; 16 for UTF-8
574      cd             the block with char table pointers
575    
576    Returns:         nothing
577    */
578    
579    static void
580    set_type_bits(uschar *start_bits, int cbit_type, int table_limit,
581      compile_data *cd)
582    {
583    register int c;
584    for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];
585    if (table_limit == 32) return;
586    for (c = 128; c < 256; c++)
587      {
588      if ((cd->cbits[c/8] & (1 << (c&7))) != 0)
589        {
590        uschar buff[8];
591        (void)_pcre_ord2utf8(c, buff);
592        SET_BIT(buff[0]);
593        }
594      }
595    }
596    
597    
598    /*************************************************
599    *     Set bits for a negative character type     *
600    *************************************************/
601    
602    /* This function sets starting bits for a negative character type such as \D.
603    In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
604    otherwise there can be confusion with bytes in the middle of UTF-8 characters.
605    Unlike in the positive case, where we can set appropriate starting bits for
606    specific high-valued UTF-8 characters, in this case we have to set the bits for
607    all high-valued characters. The lowest is 0xc2, but we overkill by starting at
608    0xc0 (192) for simplicity.
609    
610    Arguments:
611      start_bits     the starting bitmap
612      cbit type      the type of character wanted
613      table_limit    32 for non-UTF-8; 16 for UTF-8
614      cd             the block with char table pointers
615    
616    Returns:         nothing
617    */
618    
619    static void
620    set_nottype_bits(uschar *start_bits, int cbit_type, int table_limit,
621      compile_data *cd)
622    {
623    register int c;
624    for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type];
625    if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;
626    }
627    
628    
629    
630    /*************************************************
631  *          Create bitmap of starting bytes       *  *          Create bitmap of starting bytes       *
632  *************************************************/  *************************************************/
633    
# Line 504  function fails unless the result is SSB_ Line 642  function fails unless the result is SSB_
642  Arguments:  Arguments:
643    code         points to an expression    code         points to an expression
644    start_bits   points to a 32-byte table, initialized to 0    start_bits   points to a 32-byte table, initialized to 0
   caseless     the current state of the caseless flag  
645    utf8         TRUE if in UTF-8 mode    utf8         TRUE if in UTF-8 mode
646    cd           the block with char table pointers    cd           the block with char table pointers
647    
# Line 514  Returns: SSB_FAIL => Failed to Line 651  Returns: SSB_FAIL => Failed to
651  */  */
652    
653  static int  static int
654  set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,  set_start_bits(const uschar *code, uschar *start_bits, BOOL utf8,
655    BOOL utf8, compile_data *cd)    compile_data *cd)
656  {  {
657  register int c;  register int c;
658  int yield = SSB_DONE;  int yield = SSB_DONE;
659    int table_limit = utf8? 16:32;
660    
661  #if 0  #if 0
662  /* ========================================================================= */  /* ========================================================================= */
# Line 539  volatile int dummy; Line 677  volatile int dummy;
677    
678  do  do
679    {    {
   const uschar *tcode = code + (((int)*code == OP_CBRA)? 3:1) + LINK_SIZE;  
680    BOOL try_next = TRUE;    BOOL try_next = TRUE;
681      const uschar *tcode = code + 1 + LINK_SIZE;
682    
683      if (*code == OP_CBRA || *code == OP_SCBRA ||
684          *code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += 2;
685    
686    while (try_next)    /* Loop for items in this branch */    while (try_next)    /* Loop for items in this branch */
687      {      {
# Line 561  do Line 702  do
702        case OP_SBRA:        case OP_SBRA:
703        case OP_CBRA:        case OP_CBRA:
704        case OP_SCBRA:        case OP_SCBRA:
705          case OP_BRAPOS:
706          case OP_SBRAPOS:
707          case OP_CBRAPOS:
708          case OP_SCBRAPOS:
709        case OP_ONCE:        case OP_ONCE:
710        case OP_ASSERT:        case OP_ASSERT:
711        rc = set_start_bits(tcode, start_bits, caseless, utf8, cd);        rc = set_start_bits(tcode, start_bits, utf8, cd);
712        if (rc == SSB_FAIL) return SSB_FAIL;        if (rc == SSB_FAIL) return SSB_FAIL;
713        if (rc == SSB_DONE) try_next = FALSE; else        if (rc == SSB_DONE) try_next = FALSE; else
714          {          {
# Line 587  do Line 732  do
732        case OP_KET:        case OP_KET:
733        case OP_KETRMAX:        case OP_KETRMAX:
734        case OP_KETRMIN:        case OP_KETRMIN:
735          case OP_KETRPOS:
736        return SSB_CONTINUE;        return SSB_CONTINUE;
737    
738        /* Skip over callout */        /* Skip over callout */
# Line 604  do Line 750  do
750        tcode += 1 + LINK_SIZE;        tcode += 1 + LINK_SIZE;
751        break;        break;
752    
       /* Skip over an option setting, changing the caseless flag */  
   
       case OP_OPT:  
       caseless = (tcode[1] & PCRE_CASELESS) != 0;  
       tcode += 2;  
       break;  
   
753        /* BRAZERO does the bracket, but carries on. */        /* BRAZERO does the bracket, but carries on. */
754    
755        case OP_BRAZERO:        case OP_BRAZERO:
756        case OP_BRAMINZERO:        case OP_BRAMINZERO:
757        if (set_start_bits(++tcode, start_bits, caseless, utf8, cd) == SSB_FAIL)        case OP_BRAPOSZERO:
758          if (set_start_bits(++tcode, start_bits, utf8, cd) == SSB_FAIL)
759          return SSB_FAIL;          return SSB_FAIL;
760  /* =========================================================================  /* =========================================================================
761        See the comment at the head of this function concerning the next line,        See the comment at the head of this function concerning the next line,
# Line 642  do Line 782  do
782        case OP_QUERY:        case OP_QUERY:
783        case OP_MINQUERY:        case OP_MINQUERY:
784        case OP_POSQUERY:        case OP_POSQUERY:
785        tcode = set_table_bit(start_bits, tcode + 1, caseless, cd, utf8);        tcode = set_table_bit(start_bits, tcode + 1, FALSE, cd, utf8);
786          break;
787    
788          case OP_STARI:
789          case OP_MINSTARI:
790          case OP_POSSTARI:
791          case OP_QUERYI:
792          case OP_MINQUERYI:
793          case OP_POSQUERYI:
794          tcode = set_table_bit(start_bits, tcode + 1, TRUE, cd, utf8);
795        break;        break;
796    
797        /* Single-char upto sets the bit and tries the next */        /* Single-char upto sets the bit and tries the next */
# Line 650  do Line 799  do
799        case OP_UPTO:        case OP_UPTO:
800        case OP_MINUPTO:        case OP_MINUPTO:
801        case OP_POSUPTO:        case OP_POSUPTO:
802        tcode = set_table_bit(start_bits, tcode + 3, caseless, cd, utf8);        tcode = set_table_bit(start_bits, tcode + 3, FALSE, cd, utf8);
803          break;
804    
805          case OP_UPTOI:
806          case OP_MINUPTOI:
807          case OP_POSUPTOI:
808          tcode = set_table_bit(start_bits, tcode + 3, TRUE, cd, utf8);
809        break;        break;
810    
811        /* At least one single char sets the bit and stops */        /* At least one single char sets the bit and stops */
812    
813        case OP_EXACT:       /* Fall through */        case OP_EXACT:
814        tcode += 2;        tcode += 2;
815          /* Fall through */
816        case OP_CHAR:        case OP_CHAR:
       case OP_CHARNC:  
817        case OP_PLUS:        case OP_PLUS:
818        case OP_MINPLUS:        case OP_MINPLUS:
819        case OP_POSPLUS:        case OP_POSPLUS:
820        (void)set_table_bit(start_bits, tcode + 1, caseless, cd, utf8);        (void)set_table_bit(start_bits, tcode + 1, FALSE, cd, utf8);
821          try_next = FALSE;
822          break;
823    
824          case OP_EXACTI:
825          tcode += 2;
826          /* Fall through */
827          case OP_CHARI:
828          case OP_PLUSI:
829          case OP_MINPLUSI:
830          case OP_POSPLUSI:
831          (void)set_table_bit(start_bits, tcode + 1, TRUE, cd, utf8);
832          try_next = FALSE;
833          break;
834    
835          /* Special spacing and line-terminating items. These recognize specific
836          lists of characters. The difference between VSPACE and ANYNL is that the
837          latter can match the two-character CRLF sequence, but that is not
838          relevant for finding the first character, so their code here is
839          identical. */
840    
841          case OP_HSPACE:
842          SET_BIT(0x09);
843          SET_BIT(0x20);
844          if (utf8)
845            {
846            SET_BIT(0xC2);  /* For U+00A0 */
847            SET_BIT(0xE1);  /* For U+1680, U+180E */
848            SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
849            SET_BIT(0xE3);  /* For U+3000 */
850            }
851          else SET_BIT(0xA0);
852          try_next = FALSE;
853          break;
854    
855          case OP_ANYNL:
856          case OP_VSPACE:
857          SET_BIT(0x0A);
858          SET_BIT(0x0B);
859          SET_BIT(0x0C);
860          SET_BIT(0x0D);
861          if (utf8)
862            {
863            SET_BIT(0xC2);  /* For U+0085 */
864            SET_BIT(0xE2);  /* For U+2028, U+2029 */
865            }
866          else SET_BIT(0x85);
867        try_next = FALSE;        try_next = FALSE;
868        break;        break;
869    
870        /* Single character types set the bits and stop. Note that if PCRE_UCP        /* Single character types set the bits and stop. Note that if PCRE_UCP
871        is set, we do not see these op codes because \d etc are converted to        is set, we do not see these op codes because \d etc are converted to
872        properties. Therefore, these apply in the case when only ASCII characters        properties. Therefore, these apply in the case when only characters less
873        are recognized to match the types. */        than 256 are recognized to match the types. */
874    
875        case OP_NOT_DIGIT:        case OP_NOT_DIGIT:
876        for (c = 0; c < 32; c++)        set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
         start_bits[c] |= ~cd->cbits[c+cbit_digit];  
877        try_next = FALSE;        try_next = FALSE;
878        break;        break;
879    
880        case OP_DIGIT:        case OP_DIGIT:
881        for (c = 0; c < 32; c++)        set_type_bits(start_bits, cbit_digit, table_limit, cd);
         start_bits[c] |= cd->cbits[c+cbit_digit];  
882        try_next = FALSE;        try_next = FALSE;
883        break;        break;
884    
885        /* The cbit_space table has vertical tab as whitespace; we have to        /* The cbit_space table has vertical tab as whitespace; we have to
886        discard it. */        ensure it is set as not whitespace. */
887    
888        case OP_NOT_WHITESPACE:        case OP_NOT_WHITESPACE:
889        for (c = 0; c < 32; c++)        set_nottype_bits(start_bits, cbit_space, table_limit, cd);
890          {        start_bits[1] |= 0x08;
         int d = cd->cbits[c+cbit_space];  
         if (c == 1) d &= ~0x08;  
         start_bits[c] |= ~d;  
         }  
891        try_next = FALSE;        try_next = FALSE;
892        break;        break;
893    
894        /* The cbit_space table has vertical tab as whitespace; we have to        /* The cbit_space table has vertical tab as whitespace; we have to
895        discard it. */        not set it from the table. */
896    
897        case OP_WHITESPACE:        case OP_WHITESPACE:
898        for (c = 0; c < 32; c++)        c = start_bits[1];    /* Save in case it was already set */
899          {        set_type_bits(start_bits, cbit_space, table_limit, cd);
900          int d = cd->cbits[c+cbit_space];        start_bits[1] = (start_bits[1] & ~0x08) | c;
         if (c == 1) d &= ~0x08;  
         start_bits[c] |= d;  
         }  
901        try_next = FALSE;        try_next = FALSE;
902        break;        break;
903    
904        case OP_NOT_WORDCHAR:        case OP_NOT_WORDCHAR:
905        for (c = 0; c < 32; c++)        set_nottype_bits(start_bits, cbit_word, table_limit, cd);
         start_bits[c] |= ~cd->cbits[c+cbit_word];  
906        try_next = FALSE;        try_next = FALSE;
907        break;        break;
908    
909        case OP_WORDCHAR:        case OP_WORDCHAR:
910        for (c = 0; c < 32; c++)        set_type_bits(start_bits, cbit_word, table_limit, cd);
         start_bits[c] |= cd->cbits[c+cbit_word];  
911        try_next = FALSE;        try_next = FALSE;
912        break;        break;
913    
# Line 727  do Line 916  do
916    
917        case OP_TYPEPLUS:        case OP_TYPEPLUS:
918        case OP_TYPEMINPLUS:        case OP_TYPEMINPLUS:
919          case OP_TYPEPOSPLUS:
920        tcode++;        tcode++;
921        break;        break;
922    
# Line 750  do Line 940  do
940        case OP_TYPEPOSQUERY:        case OP_TYPEPOSQUERY:
941        switch(tcode[1])        switch(tcode[1])
942          {          {
943            default:
944          case OP_ANY:          case OP_ANY:
945          case OP_ALLANY:          case OP_ALLANY:
946          return SSB_FAIL;          return SSB_FAIL;
947    
948            case OP_HSPACE:
949            SET_BIT(0x09);
950            SET_BIT(0x20);
951            if (utf8)
952              {
953              SET_BIT(0xC2);  /* For U+00A0 */
954              SET_BIT(0xE1);  /* For U+1680, U+180E */
955              SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
956              SET_BIT(0xE3);  /* For U+3000 */
957              }
958            else SET_BIT(0xA0);
959            break;
960    
961            case OP_ANYNL:
962            case OP_VSPACE:
963            SET_BIT(0x0A);
964            SET_BIT(0x0B);
965            SET_BIT(0x0C);
966            SET_BIT(0x0D);
967            if (utf8)
968              {
969              SET_BIT(0xC2);  /* For U+0085 */
970              SET_BIT(0xE2);  /* For U+2028, U+2029 */
971              }
972            else SET_BIT(0x85);
973            break;
974    
975          case OP_NOT_DIGIT:          case OP_NOT_DIGIT:
976          for (c = 0; c < 32; c++)          set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
           start_bits[c] |= ~cd->cbits[c+cbit_digit];  
977          break;          break;
978    
979          case OP_DIGIT:          case OP_DIGIT:
980          for (c = 0; c < 32; c++)          set_type_bits(start_bits, cbit_digit, table_limit, cd);
           start_bits[c] |= cd->cbits[c+cbit_digit];  
981          break;          break;
982    
983          /* The cbit_space table has vertical tab as whitespace; we have to          /* The cbit_space table has vertical tab as whitespace; we have to
984          discard it. */          ensure it gets set as not whitespace. */
985    
986          case OP_NOT_WHITESPACE:          case OP_NOT_WHITESPACE:
987          for (c = 0; c < 32; c++)          set_nottype_bits(start_bits, cbit_space, table_limit, cd);
988            {          start_bits[1] |= 0x08;
           int d = cd->cbits[c+cbit_space];  
           if (c == 1) d &= ~0x08;  
           start_bits[c] |= ~d;  
           }  
989          break;          break;
990    
991          /* The cbit_space table has vertical tab as whitespace; we have to          /* The cbit_space table has vertical tab as whitespace; we have to
992          discard it. */          avoid setting it. */
993    
994          case OP_WHITESPACE:          case OP_WHITESPACE:
995          for (c = 0; c < 32; c++)          c = start_bits[1];    /* Save in case it was already set */
996            {          set_type_bits(start_bits, cbit_space, table_limit, cd);
997            int d = cd->cbits[c+cbit_space];          start_bits[1] = (start_bits[1] & ~0x08) | c;
           if (c == 1) d &= ~0x08;  
           start_bits[c] |= d;  
           }  
998          break;          break;
999    
1000          case OP_NOT_WORDCHAR:          case OP_NOT_WORDCHAR:
1001          for (c = 0; c < 32; c++)          set_nottype_bits(start_bits, cbit_word, table_limit, cd);
           start_bits[c] |= ~cd->cbits[c+cbit_word];  
1002          break;          break;
1003    
1004          case OP_WORDCHAR:          case OP_WORDCHAR:
1005          for (c = 0; c < 32; c++)          set_type_bits(start_bits, cbit_word, table_limit, cd);
           start_bits[c] |= cd->cbits[c+cbit_word];  
1006          break;          break;
1007          }          }
1008    
# Line 958  if ((re->options & PCRE_ANCHORED) == 0 & Line 1165  if ((re->options & PCRE_ANCHORED) == 0 &
1165    /* See if we can find a fixed set of initial characters for the pattern. */    /* See if we can find a fixed set of initial characters for the pattern. */
1166    
1167    memset(start_bits, 0, 32 * sizeof(uschar));    memset(start_bits, 0, 32 * sizeof(uschar));
1168    bits_set = set_start_bits(code, start_bits,    bits_set = set_start_bits(code, start_bits, (re->options & PCRE_UTF8) != 0,
     (re->options & PCRE_CASELESS) != 0, (re->options & PCRE_UTF8) != 0,  
1169      &compile_block) == SSB_DONE;      &compile_block) == SSB_DONE;
1170    }    }
1171    
1172  /* Find the minimum length of subject string. */  /* Find the minimum length of subject string. */
1173    
1174  min = find_minlength(code, code, re->options);  switch(min = find_minlength(code, code, re->options))
1175      {
1176      case -2: *errorptr = "internal error: missing capturing bracket"; break;
1177      case -3: *errorptr = "internal error: opcode not recognized"; break;
1178      default: break;
1179      }
1180    
1181  /* Return NULL if no optimization is possible. */  /* Return NULL if no optimization is possible. */
1182    

Legend:
Removed from v.520  
changed lines
  Added in v.604

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12