/[pcre]/code/trunk/pcre_study.c
ViewVC logotype

Diff of /code/trunk/pcre_study.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 538 by ph10, Wed Jun 9 19:30:57 2010 UTC revision 539 by ph10, Sun Jun 13 21:35:04 2010 UTC
# Line 490  return p + 1; Line 490  return p + 1;
490    
491    
492  /*************************************************  /*************************************************
493    *     Set bits for a positive character type     *
494    *************************************************/
495    
496    /* This function sets starting bits for a character type. In UTF-8 mode, we can
497    only do a direct setting for bytes less than 128, as otherwise there can be
498    confusion with bytes in the middle of UTF-8 characters. In a "traditional"
499    environment, the tables will only recognize ASCII characters anyway, but in at
500    least one Windows environment, bits for some higher bytes were set in the tables.
501    So we deal with that case by considering the UTF-8 encoding.
502    
503    Arguments:
504      start_bits     the starting bitmap
505      cbit_type      the type of character wanted
506      table_limit    32 for non-UTF-8; 16 for UTF-8
507      cd             the block with char table pointers
508    
509    Returns:         nothing
510    */
511    
512    static void
513    set_type_bits(uschar *start_bits, int cbit_type, int table_limit,
514      compile_data *cd)
515    {
516    register int c;
517    for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];  /* OR in the first table_limit bytes of the class bitmap that starts at cbit_type */
518    if (table_limit == 32) return;  /* all 32 bytes were merged above, so nothing is left to do */
519    for (c = 128; c < 256; c++)  /* characters 128-255 are not covered when only 16 bytes were merged */
520      {
521      if ((cd->cbits[c/8 + cbit_type] & (1 << (c&7))) != 0)  /* test c's bit in the requested class table; without the cbit_type offset this always read the table at offset 0 */
522        {
523        uschar buff[8];
524        (void)_pcre_ord2utf8(c, buff);  /* presumably writes the UTF-8 encoding of c into buff; only its first byte is used */
525        SET_BIT(buff[0]);  /* flag the first byte of the encoding (SET_BIT is defined elsewhere; presumably it sets that bit in start_bits) */
526        }
527      }
528    }
529    
530    
531    /*************************************************
532    *     Set bits for a negative character type     *
533    *************************************************/
534    
535    /* This function sets starting bits for a negative character type such as \D.
536    In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
537    otherwise there can be confusion with bytes in the middle of UTF-8 characters.
538    Unlike in the positive case, where we can set appropriate starting bits for
539    specific high-valued UTF-8 characters, in this case we have to set the bits for
540    all high-valued characters. The lowest is 0xc2, but we overkill by starting at
541    0xc0 (192) for simplicity.
542    
543    Arguments:
544      start_bits     the starting bitmap
545      cbit_type      the type of character wanted
546      table_limit    32 for non-UTF-8; 16 for UTF-8
547      cd             the block with char table pointers
548    
549    Returns:         nothing
550    */
551    
552    static void
553    set_nottype_bits(uschar *start_bits, int cbit_type, int table_limit,
554      compile_data *cd)
555    {
556    register int c;
557    for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type];  /* OR in the complement of the first table_limit bytes of the class bitmap at cbit_type */
558    if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;  /* when fewer than 32 bytes were merged, set every bit for byte values 192 (0xc0) to 255 */
559    }
560    
561    
562    
563    /*************************************************
564  *          Create bitmap of starting bytes       *  *          Create bitmap of starting bytes       *
565  *************************************************/  *************************************************/
566    
# Line 705  do Line 776  do
776    
777        /* Single character types set the bits and stop. Note that if PCRE_UCP        /* Single character types set the bits and stop. Note that if PCRE_UCP
778        is set, we do not see these op codes because \d etc are converted to        is set, we do not see these op codes because \d etc are converted to
779        properties. Therefore, these apply in the case when only ASCII characters        properties. Therefore, these apply in the case when only characters less
780        are recognized to match the types. In UTF-8 mode, we must restrict        than 256 are recognized to match the types. */
       ourselves to bytes less than 128, as otherwise there can be confusion  
       with bytes in the middle of UTF-8 characters. (In a "traditional"  
       environment, the tables will only recognize ASCII characters anyway, but  
       in at least one Windows environment, some higher bytes bits were set in  
       the tables.) */  
781    
782        case OP_NOT_DIGIT:        case OP_NOT_DIGIT:
783        for (c = 0; c < table_limit; c++)        set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
         start_bits[c] |= ~cd->cbits[c+cbit_digit];  
784        try_next = FALSE;        try_next = FALSE;
785        break;        break;
786    
787        case OP_DIGIT:        case OP_DIGIT:
788        for (c = 0; c < table_limit; c++)        set_type_bits(start_bits, cbit_digit, table_limit, cd);
         start_bits[c] |= cd->cbits[c+cbit_digit];  
789        try_next = FALSE;        try_next = FALSE;
790        break;        break;
791    
792        /* The cbit_space table has vertical tab as whitespace; we have to        /* The cbit_space table has vertical tab as whitespace; we have to
793        discard it. */        ensure it is set as not whitespace. */
794    
795        case OP_NOT_WHITESPACE:        case OP_NOT_WHITESPACE:
796        for (c = 0; c < table_limit; c++)        set_nottype_bits(start_bits, cbit_space, table_limit, cd);
797          {        start_bits[1] |= 0x08;
         int d = cd->cbits[c+cbit_space];  
         if (c == 1) d &= ~0x08;  
         start_bits[c] |= ~d;  
         }  
798        try_next = FALSE;        try_next = FALSE;
799        break;        break;
800    
801        /* The cbit_space table has vertical tab as whitespace; we have to        /* The cbit_space table has vertical tab as whitespace; we have to
802        discard it. */        not set it from the table. */
803    
804        case OP_WHITESPACE:        case OP_WHITESPACE:
805        for (c = 0; c < table_limit; c++)        c = start_bits[1];    /* Save in case it was already set */
806          {        set_type_bits(start_bits, cbit_space, table_limit, cd);
807          int d = cd->cbits[c+cbit_space];        start_bits[1] = (start_bits[1] & ~0x08) | c;
         if (c == 1) d &= ~0x08;  
         start_bits[c] |= d;  
         }  
808        try_next = FALSE;        try_next = FALSE;
809        break;        break;
810    
811        case OP_NOT_WORDCHAR:        case OP_NOT_WORDCHAR:
812        for (c = 0; c < table_limit; c++)        set_nottype_bits(start_bits, cbit_word, table_limit, cd);
         start_bits[c] |= ~cd->cbits[c+cbit_word];  
813        try_next = FALSE;        try_next = FALSE;
814        break;        break;
815    
816        case OP_WORDCHAR:        case OP_WORDCHAR:
817        for (c = 0; c < table_limit; c++)        set_type_bits(start_bits, cbit_word, table_limit, cd);
         start_bits[c] |= cd->cbits[c+cbit_word];  
818        try_next = FALSE;        try_next = FALSE;
819        break;        break;
820    
821        /* One or more character type fudges the pointer and restarts, knowing        /* One or more character type fudges the pointer and restarts, knowing
822        it will hit a single character type and stop there. */        it will hit a single character type and stop there. */
823    
# Line 825  do Line 880  do
880          break;          break;
881    
882          case OP_NOT_DIGIT:          case OP_NOT_DIGIT:
883          for (c = 0; c < table_limit; c++)          set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
           start_bits[c] |= ~cd->cbits[c+cbit_digit];  
884          break;          break;
885    
886          case OP_DIGIT:          case OP_DIGIT:
887          for (c = 0; c < table_limit; c++)          set_type_bits(start_bits, cbit_digit, table_limit, cd);
           start_bits[c] |= cd->cbits[c+cbit_digit];  
888          break;          break;
889    
890          /* The cbit_space table has vertical tab as whitespace; we have to          /* The cbit_space table has vertical tab as whitespace; we have to
891          discard it. */          ensure it gets set as not whitespace. */
892    
893          case OP_NOT_WHITESPACE:          case OP_NOT_WHITESPACE:
894          for (c = 0; c < table_limit; c++)          set_nottype_bits(start_bits, cbit_space, table_limit, cd);
895            {          start_bits[1] |= 0x08;
           int d = cd->cbits[c+cbit_space];  
           if (c == 1) d &= ~0x08;  
           start_bits[c] |= ~d;  
           }  
896          break;          break;
897    
898          /* The cbit_space table has vertical tab as whitespace; we have to          /* The cbit_space table has vertical tab as whitespace; we have to
899          discard it. */          avoid setting it. */
900    
901          case OP_WHITESPACE:          case OP_WHITESPACE:
902          for (c = 0; c < table_limit; c++)          c = start_bits[1];    /* Save in case it was already set */
903            {          set_type_bits(start_bits, cbit_space, table_limit, cd);
904            int d = cd->cbits[c+cbit_space];          start_bits[1] = (start_bits[1] & ~0x08) | c;
           if (c == 1) d &= ~0x08;  
           start_bits[c] |= d;  
           }  
905          break;          break;
906    
907          case OP_NOT_WORDCHAR:          case OP_NOT_WORDCHAR:
908          for (c = 0; c < table_limit; c++)          set_nottype_bits(start_bits, cbit_word, table_limit, cd);
           start_bits[c] |= ~cd->cbits[c+cbit_word];  
909          break;          break;
910    
911          case OP_WORDCHAR:          case OP_WORDCHAR:
912          for (c = 0; c < table_limit; c++)          set_type_bits(start_bits, cbit_word, table_limit, cd);
           start_bits[c] |= cd->cbits[c+cbit_word];  
913          break;          break;
914          }          }
915    

Legend:
Removed from v.538  
changed lines
  Added in v.539

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12