/[pcre]/code/trunk/pcre_study.c
ViewVC logotype

Diff of /code/trunk/pcre_study.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 524 by ph10, Mon May 24 17:06:28 2010 UTC revision 545 by ph10, Wed Jun 16 10:51:15 2010 UTC
# Line 441  for (;;) Line 441  for (;;)
441  *      Set a bit and maybe its alternate case    *  *      Set a bit and maybe its alternate case    *
442  *************************************************/  *************************************************/
443    
444  /* Given a character, set its first byte's bit in the table, and also the  /* Given a character, set its first byte's bit in the table, and also the
445  corresponding bit for the other version of a letter if we are caseless. In  corresponding bit for the other version of a letter if we are caseless. In
446  UTF-8 mode, for characters greater than 127, we can only do the caseless thing  UTF-8 mode, for characters greater than 127, we can only do the caseless thing
447  when Unicode property support is available.  when Unicode property support is available.
# Line 451  Arguments: Line 451  Arguments:
451    p             points to the character    p             points to the character
452    caseless      the caseless flag    caseless      the caseless flag
453    cd            the block with char table pointers    cd            the block with char table pointers
454    utf8          TRUE for UTF-8 mode    utf8          TRUE for UTF-8 mode
455    
456  Returns:        pointer after the character  Returns:        pointer after the character
457  */  */
# Line 471  if (utf8 && c > 127) Line 471  if (utf8 && c > 127)
471  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
472    if (caseless)    if (caseless)
473      {      {
474      uschar buff[8];      uschar buff[8];
475      c = UCD_OTHERCASE(c);      c = UCD_OTHERCASE(c);
476      (void)_pcre_ord2utf8(c, buff);      (void)_pcre_ord2utf8(c, buff);
477      SET_BIT(buff[0]);      SET_BIT(buff[0]);
478      }      }
479  #endif  #endif
480    return p;    return p;
481    }    }
482  #endif  #endif
483    
484  /* Not UTF-8 mode, or character is less than 127. */  /* Not UTF-8 mode, or character is less than 127. */
485    
# Line 490  return p + 1; Line 490  return p + 1;
490    
491    
492  /*************************************************  /*************************************************
493    *     Set bits for a positive character type     *
494    *************************************************/
495    
496    /* This function sets starting bits for a character type. In UTF-8 mode, we can
497    only do a direct setting for bytes less than 128, as otherwise there can be
498    confusion with bytes in the middle of UTF-8 characters. In a "traditional"
499    environment, the tables will only recognize ASCII characters anyway, but in at
500    least one Windows environment, some higher bytes' bits were set in the tables.
501    So we deal with that case by considering the UTF-8 encoding.
502    
503    Arguments:
504      start_bits     the starting bitmap
505      cbit_type      the type of character wanted
506      table_limit    32 for non-UTF-8; 16 for UTF-8
507      cd             the block with char table pointers
508    
509    Returns:         nothing
510    */
511    
512    static void
513    set_type_bits(uschar *start_bits, int cbit_type, int table_limit,
514      compile_data *cd)
515    {
516    register int c;
517    for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];
518    if (table_limit == 32) return;
519    for (c = 128; c < 256; c++)
520      {
521      if ((cd->cbits[c/8] & (1 << (c&7))) != 0)
522        {
523        uschar buff[8];
524        (void)_pcre_ord2utf8(c, buff);
525        SET_BIT(buff[0]);
526        }
527      }
528    }
529    
530    
531    /*************************************************
532    *     Set bits for a negative character type     *
533    *************************************************/
534    
535    /* This function sets starting bits for a negative character type such as \D.
536    In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
537    otherwise there can be confusion with bytes in the middle of UTF-8 characters.
538    Unlike in the positive case, where we can set appropriate starting bits for
539    specific high-valued UTF-8 characters, in this case we have to set the bits for
540    all high-valued characters. The lowest is 0xc2, but we overkill by starting at
541    0xc0 (192) for simplicity.
542    
543    Arguments:
544      start_bits     the starting bitmap
545      cbit_type      the type of character wanted
546      table_limit    32 for non-UTF-8; 16 for UTF-8
547      cd             the block with char table pointers
548    
549    Returns:         nothing
550    */
551    
552    static void
553    set_nottype_bits(uschar *start_bits, int cbit_type, int table_limit,
554      compile_data *cd)
555    {
556    register int c;
557    for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type];
558    if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;
559    }
560    
561    
562    
563    /*************************************************
564  *          Create bitmap of starting bytes       *  *          Create bitmap of starting bytes       *
565  *************************************************/  *************************************************/
566    
# Line 519  set_start_bits(const uschar *code, uscha Line 590  set_start_bits(const uschar *code, uscha
590  {  {
591  register int c;  register int c;
592  int yield = SSB_DONE;  int yield = SSB_DONE;
593    int table_limit = utf8? 16:32;
594    
595  #if 0  #if 0
596  /* ========================================================================= */  /* ========================================================================= */
# Line 666  do Line 738  do
738        (void)set_table_bit(start_bits, tcode + 1, caseless, cd, utf8);        (void)set_table_bit(start_bits, tcode + 1, caseless, cd, utf8);
739        try_next = FALSE;        try_next = FALSE;
740        break;        break;
741    
742        /* Special spacing and line-terminating items. These recognize specific        /* Special spacing and line-terminating items. These recognize specific
743        lists of characters. The difference between VSPACE and ANYNL is that the        lists of characters. The difference between VSPACE and ANYNL is that the
744        latter can match the two-character CRLF sequence, but that is not        latter can match the two-character CRLF sequence, but that is not
745        relevant for finding the first character, so their code here is        relevant for finding the first character, so their code here is
746        identical. */        identical. */
747    
748        case OP_HSPACE:        case OP_HSPACE:
749        SET_BIT(0x09);        SET_BIT(0x09);
750        SET_BIT(0x20);        SET_BIT(0x20);
       SET_BIT(0xA0);  
751        if (utf8)        if (utf8)
752          {          {
753            SET_BIT(0xC2);  /* For U+00A0 */
754          SET_BIT(0xE1);  /* For U+1680, U+180E */          SET_BIT(0xE1);  /* For U+1680, U+180E */
755          SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */          SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
756          SET_BIT(0xE3);  /* For U+3000 */          SET_BIT(0xE3);  /* For U+3000 */
757          }          }
758          else SET_BIT(0xA0);
759        try_next = FALSE;        try_next = FALSE;
760        break;        break;
761    
762        case OP_ANYNL:        case OP_ANYNL:
763        case OP_VSPACE:        case OP_VSPACE:
764        SET_BIT(0x0A);        SET_BIT(0x0A);
765        SET_BIT(0x0B);        SET_BIT(0x0B);
766        SET_BIT(0x0C);        SET_BIT(0x0C);
767        SET_BIT(0x0D);        SET_BIT(0x0D);
768        SET_BIT(0x85);        if (utf8)
769        if (utf8) SET_BIT(0xE2);    /* For U+2028, U+2029 */          {
770            SET_BIT(0xC2);  /* For U+0085 */
771            SET_BIT(0xE2);  /* For U+2028, U+2029 */
772            }
773          else SET_BIT(0x85);
774        try_next = FALSE;        try_next = FALSE;
775        break;        break;
776    
777        /* Single character types set the bits and stop. Note that if PCRE_UCP        /* Single character types set the bits and stop. Note that if PCRE_UCP
778        is set, we do not see these op codes because \d etc are converted to        is set, we do not see these op codes because \d etc are converted to
779        properties. Therefore, these apply in the case when only ASCII characters        properties. Therefore, these apply in the case when only characters less
780        are recognized to match the types. */        than 256 are recognized to match the types. */
781    
782        case OP_NOT_DIGIT:        case OP_NOT_DIGIT:
783        for (c = 0; c < 32; c++)        set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
         start_bits[c] |= ~cd->cbits[c+cbit_digit];  
784        try_next = FALSE;        try_next = FALSE;
785        break;        break;
786    
787        case OP_DIGIT:        case OP_DIGIT:
788        for (c = 0; c < 32; c++)        set_type_bits(start_bits, cbit_digit, table_limit, cd);
         start_bits[c] |= cd->cbits[c+cbit_digit];  
789        try_next = FALSE;        try_next = FALSE;
790        break;        break;
791    
792        /* The cbit_space table has vertical tab as whitespace; we have to        /* The cbit_space table has vertical tab as whitespace; we have to
793        discard it. */        ensure it is set as not whitespace. */
794    
795        case OP_NOT_WHITESPACE:        case OP_NOT_WHITESPACE:
796        for (c = 0; c < 32; c++)        set_nottype_bits(start_bits, cbit_space, table_limit, cd);
797          {        start_bits[1] |= 0x08;
         int d = cd->cbits[c+cbit_space];  
         if (c == 1) d &= ~0x08;  
         start_bits[c] |= ~d;  
         }  
798        try_next = FALSE;        try_next = FALSE;
799        break;        break;
800    
801        /* The cbit_space table has vertical tab as whitespace; we have to        /* The cbit_space table has vertical tab as whitespace; we have to
802        discard it. */        not set it from the table. */
803    
804        case OP_WHITESPACE:        case OP_WHITESPACE:
805        for (c = 0; c < 32; c++)        c = start_bits[1];    /* Save in case it was already set */
806          {        set_type_bits(start_bits, cbit_space, table_limit, cd);
807          int d = cd->cbits[c+cbit_space];        start_bits[1] = (start_bits[1] & ~0x08) | c;
         if (c == 1) d &= ~0x08;  
         start_bits[c] |= d;  
         }  
808        try_next = FALSE;        try_next = FALSE;
809        break;        break;
810    
811        case OP_NOT_WORDCHAR:        case OP_NOT_WORDCHAR:
812        for (c = 0; c < 32; c++)        set_nottype_bits(start_bits, cbit_word, table_limit, cd);
         start_bits[c] |= ~cd->cbits[c+cbit_word];  
813        try_next = FALSE;        try_next = FALSE;
814        break;        break;
815    
816        case OP_WORDCHAR:        case OP_WORDCHAR:
817        for (c = 0; c < 32; c++)        set_type_bits(start_bits, cbit_word, table_limit, cd);
         start_bits[c] |= cd->cbits[c+cbit_word];  
818        try_next = FALSE;        try_next = FALSE;
819        break;        break;
820    
# Line 757  do Line 823  do
823    
824        case OP_TYPEPLUS:        case OP_TYPEPLUS:
825        case OP_TYPEMINPLUS:        case OP_TYPEMINPLUS:
826        case OP_TYPEPOSPLUS:        case OP_TYPEPOSPLUS:
827        tcode++;        tcode++;
828        break;        break;
829    
# Line 785  do Line 851  do
851          case OP_ANY:          case OP_ANY:
852          case OP_ALLANY:          case OP_ALLANY:
853          return SSB_FAIL;          return SSB_FAIL;
854    
855          case OP_HSPACE:          case OP_HSPACE:
856          SET_BIT(0x09);          SET_BIT(0x09);
857          SET_BIT(0x20);          SET_BIT(0x20);
         SET_BIT(0xA0);  
858          if (utf8)          if (utf8)
859            {            {
860              SET_BIT(0xC2);  /* For U+00A0 */
861            SET_BIT(0xE1);  /* For U+1680, U+180E */            SET_BIT(0xE1);  /* For U+1680, U+180E */
862            SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */            SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
863            SET_BIT(0xE3);  /* For U+3000 */            SET_BIT(0xE3);  /* For U+3000 */
864            }            }
865          break;          else SET_BIT(0xA0);
866            break;
867          case OP_ANYNL:  
868            case OP_ANYNL:
869          case OP_VSPACE:          case OP_VSPACE:
870          SET_BIT(0x0A);          SET_BIT(0x0A);
871          SET_BIT(0x0B);          SET_BIT(0x0B);
872          SET_BIT(0x0C);          SET_BIT(0x0C);
873          SET_BIT(0x0D);          SET_BIT(0x0D);
874          SET_BIT(0x85);          if (utf8)
875          if (utf8) SET_BIT(0xE2);    /* For U+2028, U+2029 */            {
876          break;            SET_BIT(0xC2);  /* For U+0085 */
877              SET_BIT(0xE2);  /* For U+2028, U+2029 */
878              }
879            else SET_BIT(0x85);
880            break;
881    
882          case OP_NOT_DIGIT:          case OP_NOT_DIGIT:
883          for (c = 0; c < 32; c++)          set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
           start_bits[c] |= ~cd->cbits[c+cbit_digit];  
884          break;          break;
885    
886          case OP_DIGIT:          case OP_DIGIT:
887          for (c = 0; c < 32; c++)          set_type_bits(start_bits, cbit_digit, table_limit, cd);
           start_bits[c] |= cd->cbits[c+cbit_digit];  
888          break;          break;
889    
890          /* The cbit_space table has vertical tab as whitespace; we have to          /* The cbit_space table has vertical tab as whitespace; we have to
891          discard it. */          ensure it gets set as not whitespace. */
892    
893          case OP_NOT_WHITESPACE:          case OP_NOT_WHITESPACE:
894          for (c = 0; c < 32; c++)          set_nottype_bits(start_bits, cbit_space, table_limit, cd);
895            {          start_bits[1] |= 0x08;
           int d = cd->cbits[c+cbit_space];  
           if (c == 1) d &= ~0x08;  
           start_bits[c] |= ~d;  
           }  
896          break;          break;
897    
898          /* The cbit_space table has vertical tab as whitespace; we have to          /* The cbit_space table has vertical tab as whitespace; we have to
899          discard it. */          avoid setting it. */
900    
901          case OP_WHITESPACE:          case OP_WHITESPACE:
902          for (c = 0; c < 32; c++)          c = start_bits[1];    /* Save in case it was already set */
903            {          set_type_bits(start_bits, cbit_space, table_limit, cd);
904            int d = cd->cbits[c+cbit_space];          start_bits[1] = (start_bits[1] & ~0x08) | c;
           if (c == 1) d &= ~0x08;  
           start_bits[c] |= d;  
           }  
905          break;          break;
906    
907          case OP_NOT_WORDCHAR:          case OP_NOT_WORDCHAR:
908          for (c = 0; c < 32; c++)          set_nottype_bits(start_bits, cbit_word, table_limit, cd);
           start_bits[c] |= ~cd->cbits[c+cbit_word];  
909          break;          break;
910    
911          case OP_WORDCHAR:          case OP_WORDCHAR:
912          for (c = 0; c < 32; c++)          set_type_bits(start_bits, cbit_word, table_limit, cd);
           start_bits[c] |= cd->cbits[c+cbit_word];  
913          break;          break;
914          }          }
915    

Legend:
Removed from v.524  
changed lines
  Added in v.545

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12