/[pcre]/code/trunk/pcre_study.c
ViewVC logotype

Diff of /code/trunk/pcre_study.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 471 by ph10, Fri Dec 11 15:11:55 2009 UTC revision 545 by ph10, Wed Jun 16 10:51:15 2010 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2009 University of Cambridge             Copyright (c) 1997-2010 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 48  supporting functions. */ Line 48  supporting functions. */
48    
49  #include "pcre_internal.h"  #include "pcre_internal.h"
50    
/* Set the bit for byte value c in the 256-bit map called start_bits, which
must be in scope wherever the macro is used. Both the argument and the whole
expansion are parenthesized so that an expression argument (for example, a sum)
or an adjacent operator cannot change the grouping. The macro remains an
expression, so it can be used as the body of an "if" or "else". Note that c is
evaluated twice, so it must not have side effects. */

#define SET_BIT(c) (start_bits[(c)/8] |= (1 << ((c)&7)))
52    
53  /* Returns from set_start_bits() */  /* Returns from set_start_bits() */
54    
# Line 413  for (;;) Line 414  for (;;)
414  #endif  #endif
415      break;      break;
416    
417        /* Skip these, but we need to add in the name length. */
418    
419        case OP_MARK:
420        case OP_PRUNE_ARG:
421        case OP_SKIP_ARG:
422        case OP_THEN_ARG:
423        cc += _pcre_OP_lengths[op] + cc[1];
424        break;
425    
426      /* For the record, these are the opcodes that are matched by "default":      /* For the record, these are the opcodes that are matched by "default":
427      OP_ACCEPT, OP_CLOSE, OP_COMMIT, OP_FAIL, OP_PRUNE, OP_SET_SOM, OP_SKIP,      OP_ACCEPT, OP_CLOSE, OP_COMMIT, OP_FAIL, OP_PRUNE, OP_SET_SOM, OP_SKIP,
428      OP_THEN. */      OP_THEN. */
# Line 431  for (;;) Line 441  for (;;)
441  *      Set a bit and maybe its alternate case    *  *      Set a bit and maybe its alternate case    *
442  *************************************************/  *************************************************/
443    
444  /* Given a character, set its bit in the table, and also the bit for the other  /* Given a character, set its first byte's bit in the table, and also the
445  version of a letter if we are caseless.  corresponding bit for the other version of a letter if we are caseless. In
446    UTF-8 mode, for characters greater than 127, we can only do the caseless thing
447    when Unicode property support is available.
448    
449  Arguments:  Arguments:
450    start_bits    points to the bit map    start_bits    points to the bit map
451    c             is the character    p             points to the character
452    caseless      the caseless flag    caseless      the caseless flag
453    cd            the block with char table pointers    cd            the block with char table pointers
454      utf8          TRUE for UTF-8 mode
455    
456  Returns:        nothing  Returns:        pointer after the character
457    */
458    
459    static const uschar *
460    set_table_bit(uschar *start_bits, const uschar *p, BOOL caseless,
461      compile_data *cd, BOOL utf8)
462    {
463    unsigned int c = *p;
464    
465    SET_BIT(c);
466    
467    #ifdef SUPPORT_UTF8
468    if (utf8 && c > 127)
469      {
470      GETCHARINC(c, p);
471    #ifdef SUPPORT_UCP
472      if (caseless)
473        {
474        uschar buff[8];
475        c = UCD_OTHERCASE(c);
476        (void)_pcre_ord2utf8(c, buff);
477        SET_BIT(buff[0]);
478        }
479    #endif
480      return p;
481      }
482    #endif
483    
484    /* Not UTF-8 mode, or character is less than 127. */
485    
486    if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
487    return p + 1;
488    }
489    
490    
491    
492    /*************************************************
493    *     Set bits for a positive character type     *
494    *************************************************/
495    
496    /* This function sets starting bits for a character type. In UTF-8 mode, we can
497    only do a direct setting for bytes less than 128, as otherwise there can be
498    confusion with bytes in the middle of UTF-8 characters. In a "traditional"
499    environment, the tables will only recognize ASCII characters anyway, but in at
500    least one Windows environment, some higher bytes bits were set in the tables.
501    So we deal with that case by considering the UTF-8 encoding.
502    
503    Arguments:
504      start_bits     the starting bitmap
505      cbit type      the type of character wanted
506      table_limit    32 for non-UTF-8; 16 for UTF-8
507      cd             the block with char table pointers
508    
509    Returns:         nothing
510  */  */
511    
512  static void  static void
513  set_bit(uschar *start_bits, unsigned int c, BOOL caseless, compile_data *cd)  set_type_bits(uschar *start_bits, int cbit_type, int table_limit,
514      compile_data *cd)
515  {  {
516  start_bits[c/8] |= (1 << (c&7));  register int c;
517  if (caseless && (cd->ctypes[c] & ctype_letter) != 0)  for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];
518    start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7));  if (table_limit == 32) return;
519    for (c = 128; c < 256; c++)
520      {
521      if ((cd->cbits[c/8] & (1 << (c&7))) != 0)
522        {
523        uschar buff[8];
524        (void)_pcre_ord2utf8(c, buff);
525        SET_BIT(buff[0]);
526        }
527      }
528    }
529    
530    
531    /*************************************************
532    *     Set bits for a negative character type     *
533    *************************************************/
534    
535    /* This function sets starting bits for a negative character type such as \D.
536    In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
537    otherwise there can be confusion with bytes in the middle of UTF-8 characters.
538    Unlike in the positive case, where we can set appropriate starting bits for
539    specific high-valued UTF-8 characters, in this case we have to set the bits for
540    all high-valued characters. The lowest is 0xc2, but we overkill by starting at
541    0xc0 (192) for simplicity.
542    
543    Arguments:
544      start_bits     the starting bitmap
545      cbit type      the type of character wanted
546      table_limit    32 for non-UTF-8; 16 for UTF-8
547      cd             the block with char table pointers
548    
549    Returns:         nothing
550    */
551    
552    static void
553    set_nottype_bits(uschar *start_bits, int cbit_type, int table_limit,
554      compile_data *cd)
555    {
556    register int c;
557    for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type];
558    if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;
559  }  }
560    
561    
# Line 483  set_start_bits(const uschar *code, uscha Line 590  set_start_bits(const uschar *code, uscha
590  {  {
591  register int c;  register int c;
592  int yield = SSB_DONE;  int yield = SSB_DONE;
593    int table_limit = utf8? 16:32;
594    
595  #if 0  #if 0
596  /* ========================================================================= */  /* ========================================================================= */
# Line 606  do Line 714  do
714        case OP_QUERY:        case OP_QUERY:
715        case OP_MINQUERY:        case OP_MINQUERY:
716        case OP_POSQUERY:        case OP_POSQUERY:
717        set_bit(start_bits, tcode[1], caseless, cd);        tcode = set_table_bit(start_bits, tcode + 1, caseless, cd, utf8);
       tcode += 2;  
 #ifdef SUPPORT_UTF8  
       if (utf8 && tcode[-1] >= 0xc0)  
         tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];  
 #endif  
718        break;        break;
719    
720        /* Single-char upto sets the bit and tries the next */        /* Single-char upto sets the bit and tries the next */
# Line 619  do Line 722  do
722        case OP_UPTO:        case OP_UPTO:
723        case OP_MINUPTO:        case OP_MINUPTO:
724        case OP_POSUPTO:        case OP_POSUPTO:
725        set_bit(start_bits, tcode[3], caseless, cd);        tcode = set_table_bit(start_bits, tcode + 3, caseless, cd, utf8);
       tcode += 4;  
 #ifdef SUPPORT_UTF8  
       if (utf8 && tcode[-1] >= 0xc0)  
         tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];  
 #endif  
726        break;        break;
727    
728        /* At least one single char sets the bit and stops */        /* At least one single char sets the bit and stops */
# Line 637  do Line 735  do
735        case OP_PLUS:        case OP_PLUS:
736        case OP_MINPLUS:        case OP_MINPLUS:
737        case OP_POSPLUS:        case OP_POSPLUS:
738        set_bit(start_bits, tcode[1], caseless, cd);        (void)set_table_bit(start_bits, tcode + 1, caseless, cd, utf8);
739          try_next = FALSE;
740          break;
741    
742          /* Special spacing and line-terminating items. These recognize specific
743          lists of characters. The difference between VSPACE and ANYNL is that the
744          latter can match the two-character CRLF sequence, but that is not
745          relevant for finding the first character, so their code here is
746          identical. */
747    
748          case OP_HSPACE:
749          SET_BIT(0x09);
750          SET_BIT(0x20);
751          if (utf8)
752            {
753            SET_BIT(0xC2);  /* For U+00A0 */
754            SET_BIT(0xE1);  /* For U+1680, U+180E */
755            SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
756            SET_BIT(0xE3);  /* For U+3000 */
757            }
758          else SET_BIT(0xA0);
759          try_next = FALSE;
760          break;
761    
762          case OP_ANYNL:
763          case OP_VSPACE:
764          SET_BIT(0x0A);
765          SET_BIT(0x0B);
766          SET_BIT(0x0C);
767          SET_BIT(0x0D);
768          if (utf8)
769            {
770            SET_BIT(0xC2);  /* For U+0085 */
771            SET_BIT(0xE2);  /* For U+2028, U+2029 */
772            }
773          else SET_BIT(0x85);
774        try_next = FALSE;        try_next = FALSE;
775        break;        break;
776    
777        /* Single character type sets the bits and stops */        /* Single character types set the bits and stop. Note that if PCRE_UCP
778          is set, we do not see these op codes because \d etc are converted to
779          properties. Therefore, these apply in the case when only characters less
780          than 256 are recognized to match the types. */
781    
782        case OP_NOT_DIGIT:        case OP_NOT_DIGIT:
783        for (c = 0; c < 32; c++)        set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
         start_bits[c] |= ~cd->cbits[c+cbit_digit];  
784        try_next = FALSE;        try_next = FALSE;
785        break;        break;
786    
787        case OP_DIGIT:        case OP_DIGIT:
788        for (c = 0; c < 32; c++)        set_type_bits(start_bits, cbit_digit, table_limit, cd);
         start_bits[c] |= cd->cbits[c+cbit_digit];  
789        try_next = FALSE;        try_next = FALSE;
790        break;        break;
791    
792        /* The cbit_space table has vertical tab as whitespace; we have to        /* The cbit_space table has vertical tab as whitespace; we have to
793        discard it. */        ensure it is set as not whitespace. */
794    
795        case OP_NOT_WHITESPACE:        case OP_NOT_WHITESPACE:
796        for (c = 0; c < 32; c++)        set_nottype_bits(start_bits, cbit_space, table_limit, cd);
797          {        start_bits[1] |= 0x08;
         int d = cd->cbits[c+cbit_space];  
         if (c == 1) d &= ~0x08;  
         start_bits[c] |= ~d;  
         }  
798        try_next = FALSE;        try_next = FALSE;
799        break;        break;
800    
801        /* The cbit_space table has vertical tab as whitespace; we have to        /* The cbit_space table has vertical tab as whitespace; we have to
802        discard it. */        not set it from the table. */
803    
804        case OP_WHITESPACE:        case OP_WHITESPACE:
805        for (c = 0; c < 32; c++)        c = start_bits[1];    /* Save in case it was already set */
806          {        set_type_bits(start_bits, cbit_space, table_limit, cd);
807          int d = cd->cbits[c+cbit_space];        start_bits[1] = (start_bits[1] & ~0x08) | c;
         if (c == 1) d &= ~0x08;  
         start_bits[c] |= d;  
         }  
808        try_next = FALSE;        try_next = FALSE;
809        break;        break;
810    
811        case OP_NOT_WORDCHAR:        case OP_NOT_WORDCHAR:
812        for (c = 0; c < 32; c++)        set_nottype_bits(start_bits, cbit_word, table_limit, cd);
         start_bits[c] |= ~cd->cbits[c+cbit_word];  
813        try_next = FALSE;        try_next = FALSE;
814        break;        break;
815    
816        case OP_WORDCHAR:        case OP_WORDCHAR:
817        for (c = 0; c < 32; c++)        set_type_bits(start_bits, cbit_word, table_limit, cd);
         start_bits[c] |= cd->cbits[c+cbit_word];  
818        try_next = FALSE;        try_next = FALSE;
819        break;        break;
820    
# Line 698  do Line 823  do
823    
824        case OP_TYPEPLUS:        case OP_TYPEPLUS:
825        case OP_TYPEMINPLUS:        case OP_TYPEMINPLUS:
826          case OP_TYPEPOSPLUS:
827        tcode++;        tcode++;
828        break;        break;
829    
# Line 721  do Line 847  do
847        case OP_TYPEPOSQUERY:        case OP_TYPEPOSQUERY:
848        switch(tcode[1])        switch(tcode[1])
849          {          {
850            default:
851          case OP_ANY:          case OP_ANY:
852          case OP_ALLANY:          case OP_ALLANY:
853          return SSB_FAIL;          return SSB_FAIL;
854    
855            case OP_HSPACE:
856            SET_BIT(0x09);
857            SET_BIT(0x20);
858            if (utf8)
859              {
860              SET_BIT(0xC2);  /* For U+00A0 */
861              SET_BIT(0xE1);  /* For U+1680, U+180E */
862              SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
863              SET_BIT(0xE3);  /* For U+3000 */
864              }
865            else SET_BIT(0xA0);
866            break;
867    
868            case OP_ANYNL:
869            case OP_VSPACE:
870            SET_BIT(0x0A);
871            SET_BIT(0x0B);
872            SET_BIT(0x0C);
873            SET_BIT(0x0D);
874            if (utf8)
875              {
876              SET_BIT(0xC2);  /* For U+0085 */
877              SET_BIT(0xE2);  /* For U+2028, U+2029 */
878              }
879            else SET_BIT(0x85);
880            break;
881    
882          case OP_NOT_DIGIT:          case OP_NOT_DIGIT:
883          for (c = 0; c < 32; c++)          set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
           start_bits[c] |= ~cd->cbits[c+cbit_digit];  
884          break;          break;
885    
886          case OP_DIGIT:          case OP_DIGIT:
887          for (c = 0; c < 32; c++)          set_type_bits(start_bits, cbit_digit, table_limit, cd);
           start_bits[c] |= cd->cbits[c+cbit_digit];  
888          break;          break;
889    
890          /* The cbit_space table has vertical tab as whitespace; we have to          /* The cbit_space table has vertical tab as whitespace; we have to
891          discard it. */          ensure it gets set as not whitespace. */
892    
893          case OP_NOT_WHITESPACE:          case OP_NOT_WHITESPACE:
894          for (c = 0; c < 32; c++)          set_nottype_bits(start_bits, cbit_space, table_limit, cd);
895            {          start_bits[1] |= 0x08;
           int d = cd->cbits[c+cbit_space];  
           if (c == 1) d &= ~0x08;  
           start_bits[c] |= ~d;  
           }  
896          break;          break;
897    
898          /* The cbit_space table has vertical tab as whitespace; we have to          /* The cbit_space table has vertical tab as whitespace; we have to
899          discard it. */          avoid setting it. */
900    
901          case OP_WHITESPACE:          case OP_WHITESPACE:
902          for (c = 0; c < 32; c++)          c = start_bits[1];    /* Save in case it was already set */
903            {          set_type_bits(start_bits, cbit_space, table_limit, cd);
904            int d = cd->cbits[c+cbit_space];          start_bits[1] = (start_bits[1] & ~0x08) | c;
           if (c == 1) d &= ~0x08;  
           start_bits[c] |= d;  
           }  
905          break;          break;
906    
907          case OP_NOT_WORDCHAR:          case OP_NOT_WORDCHAR:
908          for (c = 0; c < 32; c++)          set_nottype_bits(start_bits, cbit_word, table_limit, cd);
           start_bits[c] |= ~cd->cbits[c+cbit_word];  
909          break;          break;
910    
911          case OP_WORDCHAR:          case OP_WORDCHAR:
912          for (c = 0; c < 32; c++)          set_type_bits(start_bits, cbit_word, table_limit, cd);
           start_bits[c] |= cd->cbits[c+cbit_word];  
913          break;          break;
914          }          }
915    

Legend:
Removed from v.471  
changed lines
  Added in v.545

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12