/[pcre]/code/trunk/pcre_study.c
ViewVC logotype

Diff of /code/trunk/pcre_study.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 487 by ph10, Wed Jan 6 10:26:55 2010 UTC revision 538 by ph10, Wed Jun 9 19:30:57 2010 UTC
# Line 48  supporting functions. */ Line 48  supporting functions. */
48    
49  #include "pcre_internal.h"  #include "pcre_internal.h"
50    
51    #define SET_BIT(c) start_bits[c/8] |= (1 << (c&7))
52    
53  /* Returns from set_start_bits() */  /* Returns from set_start_bits() */
54    
# Line 413  for (;;) Line 414  for (;;)
414  #endif  #endif
415      break;      break;
416    
417        /* Skip these, but we need to add in the name length. */
418    
419        case OP_MARK:
420        case OP_PRUNE_ARG:
421        case OP_SKIP_ARG:
422        case OP_THEN_ARG:
423        cc += _pcre_OP_lengths[op] + cc[1];
424        break;
425    
426      /* For the record, these are the opcodes that are matched by "default":      /* For the record, these are the opcodes that are matched by "default":
427      OP_ACCEPT, OP_CLOSE, OP_COMMIT, OP_FAIL, OP_PRUNE, OP_SET_SOM, OP_SKIP,      OP_ACCEPT, OP_CLOSE, OP_COMMIT, OP_FAIL, OP_PRUNE, OP_SET_SOM, OP_SKIP,
428      OP_THEN. */      OP_THEN. */
# Line 431  for (;;) Line 441  for (;;)
441  *      Set a bit and maybe its alternate case    *  *      Set a bit and maybe its alternate case    *
442  *************************************************/  *************************************************/
443    
444  /* Given a character, set its bit in the table, and also the bit for the other  /* Given a character, set its first byte's bit in the table, and also the
445  version of a letter if we are caseless.  corresponding bit for the other version of a letter if we are caseless. In
446    UTF-8 mode, for characters greater than 127, we can only do the caseless thing
447    when Unicode property support is available.
448    
449  Arguments:  Arguments:
450    start_bits    points to the bit map    start_bits    points to the bit map
451    c             is the character    p             points to the character
452    caseless      the caseless flag    caseless      the caseless flag
453    cd            the block with char table pointers    cd            the block with char table pointers
454      utf8          TRUE for UTF-8 mode
455    
456  Returns:        nothing  Returns:        pointer after the character
457  */  */
458    
459  static void  static const uschar *
460  set_table_bit(uschar *start_bits, unsigned int c, BOOL caseless,  set_table_bit(uschar *start_bits, const uschar *p, BOOL caseless,
461    compile_data *cd)    compile_data *cd, BOOL utf8)
462  {  {
463  start_bits[c/8] |= (1 << (c&7));  unsigned int c = *p;
464  if (caseless && (cd->ctypes[c] & ctype_letter) != 0)  
465    start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7));  SET_BIT(c);
466    
467    #ifdef SUPPORT_UTF8
468    if (utf8 && c > 127)
469      {
470      GETCHARINC(c, p);
471    #ifdef SUPPORT_UCP
472      if (caseless)
473        {
474        uschar buff[8];
475        c = UCD_OTHERCASE(c);
476        (void)_pcre_ord2utf8(c, buff);
477        SET_BIT(buff[0]);
478        }
479    #endif
480      return p;
481      }
482    #endif
483    
484    /* Not UTF-8 mode, or character is less than 128. */
485    
486    if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
487    return p + 1;
488  }  }
489    
490    
# Line 484  set_start_bits(const uschar *code, uscha Line 519  set_start_bits(const uschar *code, uscha
519  {  {
520  register int c;  register int c;
521  int yield = SSB_DONE;  int yield = SSB_DONE;
522    int table_limit = utf8? 16:32;
523    
524  #if 0  #if 0
525  /* ========================================================================= */  /* ========================================================================= */
# Line 607  do Line 643  do
643        case OP_QUERY:        case OP_QUERY:
644        case OP_MINQUERY:        case OP_MINQUERY:
645        case OP_POSQUERY:        case OP_POSQUERY:
646        set_table_bit(start_bits, tcode[1], caseless, cd);        tcode = set_table_bit(start_bits, tcode + 1, caseless, cd, utf8);
       tcode += 2;  
 #ifdef SUPPORT_UTF8  
       if (utf8 && tcode[-1] >= 0xc0)  
         tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];  
 #endif  
647        break;        break;
648    
649        /* Single-char upto sets the bit and tries the next */        /* Single-char upto sets the bit and tries the next */
# Line 620  do Line 651  do
651        case OP_UPTO:        case OP_UPTO:
652        case OP_MINUPTO:        case OP_MINUPTO:
653        case OP_POSUPTO:        case OP_POSUPTO:
654        set_table_bit(start_bits, tcode[3], caseless, cd);        tcode = set_table_bit(start_bits, tcode + 3, caseless, cd, utf8);
       tcode += 4;  
 #ifdef SUPPORT_UTF8  
       if (utf8 && tcode[-1] >= 0xc0)  
         tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];  
 #endif  
655        break;        break;
656    
657        /* At least one single char sets the bit and stops */        /* At least one single char sets the bit and stops */
# Line 638  do Line 664  do
664        case OP_PLUS:        case OP_PLUS:
665        case OP_MINPLUS:        case OP_MINPLUS:
666        case OP_POSPLUS:        case OP_POSPLUS:
667        set_table_bit(start_bits, tcode[1], caseless, cd);        (void)set_table_bit(start_bits, tcode + 1, caseless, cd, utf8);
668          try_next = FALSE;
669          break;
670    
671          /* Special spacing and line-terminating items. These recognize specific
672          lists of characters. The difference between VSPACE and ANYNL is that the
673          latter can match the two-character CRLF sequence, but that is not
674          relevant for finding the first character, so their code here is
675          identical. */
676    
677          case OP_HSPACE:
678          SET_BIT(0x09);
679          SET_BIT(0x20);
680          if (utf8)
681            {
682            SET_BIT(0xC2);  /* For U+00A0 */
683            SET_BIT(0xE1);  /* For U+1680, U+180E */
684            SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
685            SET_BIT(0xE3);  /* For U+3000 */
686            }
687          else SET_BIT(0xA0);
688          try_next = FALSE;
689          break;
690    
691          case OP_ANYNL:
692          case OP_VSPACE:
693          SET_BIT(0x0A);
694          SET_BIT(0x0B);
695          SET_BIT(0x0C);
696          SET_BIT(0x0D);
697          if (utf8)
698            {
699            SET_BIT(0xC2);  /* For U+0085 */
700            SET_BIT(0xE2);  /* For U+2028, U+2029 */
701            }
702          else SET_BIT(0x85);
703        try_next = FALSE;        try_next = FALSE;
704        break;        break;
705    
706        /* Single character type sets the bits and stops */        /* Single character types set the bits and stop. Note that if PCRE_UCP
707          is set, we do not see these op codes because \d etc are converted to
708          properties. Therefore, these apply in the case when only ASCII characters
709          are recognized to match the types. In UTF-8 mode, we must restrict
710          ourselves to bytes less than 128, as otherwise there can be confusion
711          with bytes in the middle of UTF-8 characters. (In a "traditional"
712          environment, the tables will only recognize ASCII characters anyway, but
713          in at least one Windows environment, bits for some higher bytes were set in
714          the tables.) */
715    
716        case OP_NOT_DIGIT:        case OP_NOT_DIGIT:
717        for (c = 0; c < 32; c++)        for (c = 0; c < table_limit; c++)
718          start_bits[c] |= ~cd->cbits[c+cbit_digit];          start_bits[c] |= ~cd->cbits[c+cbit_digit];
719        try_next = FALSE;        try_next = FALSE;
720        break;        break;
721    
722        case OP_DIGIT:        case OP_DIGIT:
723        for (c = 0; c < 32; c++)        for (c = 0; c < table_limit; c++)
724          start_bits[c] |= cd->cbits[c+cbit_digit];          start_bits[c] |= cd->cbits[c+cbit_digit];
725        try_next = FALSE;        try_next = FALSE;
726        break;        break;
# Line 660  do Line 729  do
729        discard it. */        discard it. */
730    
731        case OP_NOT_WHITESPACE:        case OP_NOT_WHITESPACE:
732        for (c = 0; c < 32; c++)        for (c = 0; c < table_limit; c++)
733          {          {
734          int d = cd->cbits[c+cbit_space];          int d = cd->cbits[c+cbit_space];
735          if (c == 1) d &= ~0x08;          if (c == 1) d &= ~0x08;
# Line 673  do Line 742  do
742        discard it. */        discard it. */
743    
744        case OP_WHITESPACE:        case OP_WHITESPACE:
745        for (c = 0; c < 32; c++)        for (c = 0; c < table_limit; c++)
746          {          {
747          int d = cd->cbits[c+cbit_space];          int d = cd->cbits[c+cbit_space];
748          if (c == 1) d &= ~0x08;          if (c == 1) d &= ~0x08;
# Line 683  do Line 752  do
752        break;        break;
753    
754        case OP_NOT_WORDCHAR:        case OP_NOT_WORDCHAR:
755        for (c = 0; c < 32; c++)        for (c = 0; c < table_limit; c++)
756          start_bits[c] |= ~cd->cbits[c+cbit_word];          start_bits[c] |= ~cd->cbits[c+cbit_word];
757        try_next = FALSE;        try_next = FALSE;
758        break;        break;
759    
760        case OP_WORDCHAR:        case OP_WORDCHAR:
761        for (c = 0; c < 32; c++)        for (c = 0; c < table_limit; c++)
762          start_bits[c] |= cd->cbits[c+cbit_word];          start_bits[c] |= cd->cbits[c+cbit_word];
763        try_next = FALSE;        try_next = FALSE;
764        break;        break;
# Line 699  do Line 768  do
768    
769        case OP_TYPEPLUS:        case OP_TYPEPLUS:
770        case OP_TYPEMINPLUS:        case OP_TYPEMINPLUS:
771          case OP_TYPEPOSPLUS:
772        tcode++;        tcode++;
773        break;        break;
774    
# Line 722  do Line 792  do
792        case OP_TYPEPOSQUERY:        case OP_TYPEPOSQUERY:
793        switch(tcode[1])        switch(tcode[1])
794          {          {
795            default:
796          case OP_ANY:          case OP_ANY:
797          case OP_ALLANY:          case OP_ALLANY:
798          return SSB_FAIL;          return SSB_FAIL;
799    
800            case OP_HSPACE:
801            SET_BIT(0x09);
802            SET_BIT(0x20);
803            if (utf8)
804              {
805              SET_BIT(0xC2);  /* For U+00A0 */
806              SET_BIT(0xE1);  /* For U+1680, U+180E */
807              SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
808              SET_BIT(0xE3);  /* For U+3000 */
809              }
810            else SET_BIT(0xA0);
811            break;
812    
813            case OP_ANYNL:
814            case OP_VSPACE:
815            SET_BIT(0x0A);
816            SET_BIT(0x0B);
817            SET_BIT(0x0C);
818            SET_BIT(0x0D);
819            if (utf8)
820              {
821              SET_BIT(0xC2);  /* For U+0085 */
822              SET_BIT(0xE2);  /* For U+2028, U+2029 */
823              }
824            else SET_BIT(0x85);
825            break;
826    
827          case OP_NOT_DIGIT:          case OP_NOT_DIGIT:
828          for (c = 0; c < 32; c++)          for (c = 0; c < table_limit; c++)
829            start_bits[c] |= ~cd->cbits[c+cbit_digit];            start_bits[c] |= ~cd->cbits[c+cbit_digit];
830          break;          break;
831    
832          case OP_DIGIT:          case OP_DIGIT:
833          for (c = 0; c < 32; c++)          for (c = 0; c < table_limit; c++)
834            start_bits[c] |= cd->cbits[c+cbit_digit];            start_bits[c] |= cd->cbits[c+cbit_digit];
835          break;          break;
836    
# Line 740  do Line 838  do
838          discard it. */          discard it. */
839    
840          case OP_NOT_WHITESPACE:          case OP_NOT_WHITESPACE:
841          for (c = 0; c < 32; c++)          for (c = 0; c < table_limit; c++)
842            {            {
843            int d = cd->cbits[c+cbit_space];            int d = cd->cbits[c+cbit_space];
844            if (c == 1) d &= ~0x08;            if (c == 1) d &= ~0x08;
# Line 752  do Line 850  do
850          discard it. */          discard it. */
851    
852          case OP_WHITESPACE:          case OP_WHITESPACE:
853          for (c = 0; c < 32; c++)          for (c = 0; c < table_limit; c++)
854            {            {
855            int d = cd->cbits[c+cbit_space];            int d = cd->cbits[c+cbit_space];
856            if (c == 1) d &= ~0x08;            if (c == 1) d &= ~0x08;
# Line 761  do Line 859  do
859          break;          break;
860    
861          case OP_NOT_WORDCHAR:          case OP_NOT_WORDCHAR:
862          for (c = 0; c < 32; c++)          for (c = 0; c < table_limit; c++)
863            start_bits[c] |= ~cd->cbits[c+cbit_word];            start_bits[c] |= ~cd->cbits[c+cbit_word];
864          break;          break;
865    
866          case OP_WORDCHAR:          case OP_WORDCHAR:
867          for (c = 0; c < 32; c++)          for (c = 0; c < table_limit; c++)
868            start_bits[c] |= cd->cbits[c+cbit_word];            start_bits[c] |= cd->cbits[c+cbit_word];
869          break;          break;
870          }          }

Legend:
Removed from v.487  
changed lines
  Added in v.538

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12