/[pcre]/code/trunk/pcre_study.c
ViewVC logotype

Diff of /code/trunk/pcre_study.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 467 by ph10, Mon Oct 19 11:43:18 2009 UTC revision 539 by ph10, Sun Jun 13 21:35:04 2010 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2009 University of Cambridge             Copyright (c) 1997-2010 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 48  supporting functions. */ Line 48  supporting functions. */
48    
49  #include "pcre_internal.h"  #include "pcre_internal.h"
50    
51    #define SET_BIT(c) start_bits[c/8] |= (1 << (c&7))
52    
53  /* Returns from set_start_bits() */  /* Returns from set_start_bits() */
54    
# Line 96  for (;;) Line 97  for (;;)
97    
98    switch (op)    switch (op)
99      {      {
100        case OP_COND:
101        case OP_SCOND:
102    
103        /* If there is only one branch in a condition, the implied branch has zero
104        length, so we don't add anything. This covers the DEFINE "condition"
105        automatically. */
106    
107        cs = cc + GET(cc, 1);
108        if (*cs != OP_ALT)
109          {
110          cc = cs + 1 + LINK_SIZE;
111          break;
112          }
113    
114        /* Otherwise we can fall through and treat it the same as any other
115        subpattern. */
116    
117      case OP_CBRA:      case OP_CBRA:
118      case OP_SCBRA:      case OP_SCBRA:
119      case OP_BRA:      case OP_BRA:
120      case OP_SBRA:      case OP_SBRA:
121      case OP_ONCE:      case OP_ONCE:
     case OP_COND:  
     case OP_SCOND:  
122      d = find_minlength(cc, startcode, options);      d = find_minlength(cc, startcode, options);
123      if (d < 0) return d;      if (d < 0) return d;
124      branchlength += d;      branchlength += d;
# Line 314  for (;;) Line 330  for (;;)
330      logic is that a recursion can only make sense if there is another      logic is that a recursion can only make sense if there is another
331      alternation that stops the recursing. That will provide the minimum length      alternation that stops the recursing. That will provide the minimum length
332      (when no recursion happens). A backreference within the group that it is      (when no recursion happens). A backreference within the group that it is
333      referencing behaves in the same way.      referencing behaves in the same way.
334    
335      If PCRE_JAVASCRIPT_COMPAT is set, a backreference to an unset bracket      If PCRE_JAVASCRIPT_COMPAT is set, a backreference to an unset bracket
336      matches an empty string (by default it causes a matching failure), so in      matches an empty string (by default it causes a matching failure), so in
337      that case we must set the minimum length to zero. */      that case we must set the minimum length to zero. */
338    
339      case OP_REF:      case OP_REF:
340      if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)      if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
341        {        {
342        ce = cs = (uschar *)_pcre_find_bracket(startcode, utf8, GET2(cc, 1));        ce = cs = (uschar *)_pcre_find_bracket(startcode, utf8, GET2(cc, 1));
343        if (cs == NULL) return -2;        if (cs == NULL) return -2;
344        do ce += GET(ce, 1); while (*ce == OP_ALT);        do ce += GET(ce, 1); while (*ce == OP_ALT);
# Line 333  for (;;) Line 349  for (;;)
349          }          }
350        else d = find_minlength(cs, startcode, options);        else d = find_minlength(cs, startcode, options);
351        }        }
352      else d = 0;      else d = 0;
353      cc += 3;      cc += 3;
354    
355      /* Handle repeated back references */      /* Handle repeated back references */
# Line 398  for (;;) Line 414  for (;;)
414  #endif  #endif
415      break;      break;
416    
417        /* Skip these, but we need to add in the name length. */
418    
419        case OP_MARK:
420        case OP_PRUNE_ARG:
421        case OP_SKIP_ARG:
422        case OP_THEN_ARG:
423        cc += _pcre_OP_lengths[op] + cc[1];
424        break;
425    
426      /* For the record, these are the opcodes that are matched by "default":      /* For the record, these are the opcodes that are matched by "default":
427      OP_ACCEPT, OP_CLOSE, OP_COMMIT, OP_FAIL, OP_PRUNE, OP_SET_SOM, OP_SKIP,      OP_ACCEPT, OP_CLOSE, OP_COMMIT, OP_FAIL, OP_PRUNE, OP_SET_SOM, OP_SKIP,
428      OP_THEN. */      OP_THEN. */
# Line 416  for (;;) Line 441  for (;;)
441  *      Set a bit and maybe its alternate case    *  *      Set a bit and maybe its alternate case    *
442  *************************************************/  *************************************************/
443    
444  /* Given a character, set its bit in the table, and also the bit for the other  /* Given a character, set its first byte's bit in the table, and also the
445  version of a letter if we are caseless.  corresponding bit for the other version of a letter if we are caseless. In
446    UTF-8 mode, for characters greater than 127, we can only do the caseless thing
447    when Unicode property support is available.
448    
449  Arguments:  Arguments:
450    start_bits    points to the bit map    start_bits    points to the bit map
451    c             is the character    p             points to the character
452    caseless      the caseless flag    caseless      the caseless flag
453    cd            the block with char table pointers    cd            the block with char table pointers
454      utf8          TRUE for UTF-8 mode
455    
456    Returns:        pointer after the character
457    */
458    
459    static const uschar *
460    set_table_bit(uschar *start_bits, const uschar *p, BOOL caseless,
461      compile_data *cd, BOOL utf8)
462    {
463    unsigned int c = *p;
464    
465    SET_BIT(c);
466    
467    #ifdef SUPPORT_UTF8
468    if (utf8 && c > 127)
469      {
470      GETCHARINC(c, p);
471    #ifdef SUPPORT_UCP
472      if (caseless)
473        {
474        uschar buff[8];
475        c = UCD_OTHERCASE(c);
476        (void)_pcre_ord2utf8(c, buff);
477        SET_BIT(buff[0]);
478        }
479    #endif
480      return p;
481      }
482    #endif
483    
484    /* Not UTF-8 mode, or character is less than 128. */
485    
486    if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
487    return p + 1;
488    }
489    
490    
491    
492    /*************************************************
493    *     Set bits for a positive character type     *
494    *************************************************/
495    
496    /* This function sets starting bits for a character type. In UTF-8 mode, we can
497    only do a direct setting for bytes less than 128, as otherwise there can be
498    confusion with bytes in the middle of UTF-8 characters. In a "traditional"
499    environment, the tables will only recognize ASCII characters anyway, but in at
500    least one Windows environment, bits for some higher bytes were set in the tables.
501    So we deal with that case by considering the UTF-8 encoding.
502    
503    Arguments:
504      start_bits     the starting bitmap
505      cbit_type      the type of character wanted
506      table_limit    32 for non-UTF-8; 16 for UTF-8
507      cd             the block with char table pointers
508    
509    Returns:         nothing
510    */
511    
512    static void
513    set_type_bits(uschar *start_bits, int cbit_type, int table_limit,
514      compile_data *cd)
515    {
516    register int c;
517    for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];
518    if (table_limit == 32) return;
519    for (c = 128; c < 256; c++)
520      {
521      if ((cd->cbits[c/8] & (1 << (c&7))) != 0)
522        {
523        uschar buff[8];
524        (void)_pcre_ord2utf8(c, buff);
525        SET_BIT(buff[0]);
526        }
527      }
528    }
529    
530    
531    /*************************************************
532    *     Set bits for a negative character type     *
533    *************************************************/
534    
535    /* This function sets starting bits for a negative character type such as \D.
536    In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
537    otherwise there can be confusion with bytes in the middle of UTF-8 characters.
538    Unlike in the positive case, where we can set appropriate starting bits for
539    specific high-valued UTF-8 characters, in this case we have to set the bits for
540    all high-valued characters. The lowest is 0xc2, but we overkill by starting at
541    0xc0 (192) for simplicity.
542    
543    Arguments:
544      start_bits     the starting bitmap
545      cbit_type      the type of character wanted
546      table_limit    32 for non-UTF-8; 16 for UTF-8
547      cd             the block with char table pointers
548    
549  Returns:        nothing  Returns:         nothing
550  */  */
551    
552  static void  static void
553  set_bit(uschar *start_bits, unsigned int c, BOOL caseless, compile_data *cd)  set_nottype_bits(uschar *start_bits, int cbit_type, int table_limit,
554      compile_data *cd)
555  {  {
556  start_bits[c/8] |= (1 << (c&7));  register int c;
557  if (caseless && (cd->ctypes[c] & ctype_letter) != 0)  for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type];
558    start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7));  if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;
559  }  }
560    
561    
# Line 468  set_start_bits(const uschar *code, uscha Line 590  set_start_bits(const uschar *code, uscha
590  {  {
591  register int c;  register int c;
592  int yield = SSB_DONE;  int yield = SSB_DONE;
593    int table_limit = utf8? 16:32;
594    
595  #if 0  #if 0
596  /* ========================================================================= */  /* ========================================================================= */
# Line 591  do Line 714  do
714        case OP_QUERY:        case OP_QUERY:
715        case OP_MINQUERY:        case OP_MINQUERY:
716        case OP_POSQUERY:        case OP_POSQUERY:
717        set_bit(start_bits, tcode[1], caseless, cd);        tcode = set_table_bit(start_bits, tcode + 1, caseless, cd, utf8);
       tcode += 2;  
 #ifdef SUPPORT_UTF8  
       if (utf8 && tcode[-1] >= 0xc0)  
         tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];  
 #endif  
718        break;        break;
719    
720        /* Single-char upto sets the bit and tries the next */        /* Single-char upto sets the bit and tries the next */
# Line 604  do Line 722  do
722        case OP_UPTO:        case OP_UPTO:
723        case OP_MINUPTO:        case OP_MINUPTO:
724        case OP_POSUPTO:        case OP_POSUPTO:
725        set_bit(start_bits, tcode[3], caseless, cd);        tcode = set_table_bit(start_bits, tcode + 3, caseless, cd, utf8);
       tcode += 4;  
 #ifdef SUPPORT_UTF8  
       if (utf8 && tcode[-1] >= 0xc0)  
         tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];  
 #endif  
726        break;        break;
727    
728        /* At least one single char sets the bit and stops */        /* At least one single char sets the bit and stops */
# Line 622  do Line 735  do
735        case OP_PLUS:        case OP_PLUS:
736        case OP_MINPLUS:        case OP_MINPLUS:
737        case OP_POSPLUS:        case OP_POSPLUS:
738        set_bit(start_bits, tcode[1], caseless, cd);        (void)set_table_bit(start_bits, tcode + 1, caseless, cd, utf8);
739        try_next = FALSE;        try_next = FALSE;
740        break;        break;
741    
742        /* Single character type sets the bits and stops */        /* Special spacing and line-terminating items. These recognize specific
743          lists of characters. The difference between VSPACE and ANYNL is that the
744          latter can match the two-character CRLF sequence, but that is not
745          relevant for finding the first character, so their code here is
746          identical. */
747    
748          case OP_HSPACE:
749          SET_BIT(0x09);
750          SET_BIT(0x20);
751          if (utf8)
752            {
753            SET_BIT(0xC2);  /* For U+00A0 */
754            SET_BIT(0xE1);  /* For U+1680, U+180E */
755            SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
756            SET_BIT(0xE3);  /* For U+3000 */
757            }
758          else SET_BIT(0xA0);
759          try_next = FALSE;
760          break;
761    
762          case OP_ANYNL:
763          case OP_VSPACE:
764          SET_BIT(0x0A);
765          SET_BIT(0x0B);
766          SET_BIT(0x0C);
767          SET_BIT(0x0D);
768          if (utf8)
769            {
770            SET_BIT(0xC2);  /* For U+0085 */
771            SET_BIT(0xE2);  /* For U+2028, U+2029 */
772            }
773          else SET_BIT(0x85);
774          try_next = FALSE;
775          break;
776    
777          /* Single character types set the bits and stop. Note that if PCRE_UCP
778          is set, we do not see these op codes because \d etc are converted to
779          properties. Therefore, these apply in the case when only characters less
780          than 256 are recognized to match the types. */
781    
782        case OP_NOT_DIGIT:        case OP_NOT_DIGIT:
783        for (c = 0; c < 32; c++)        set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
         start_bits[c] |= ~cd->cbits[c+cbit_digit];  
784        try_next = FALSE;        try_next = FALSE;
785        break;        break;
786    
787        case OP_DIGIT:        case OP_DIGIT:
788        for (c = 0; c < 32; c++)        set_type_bits(start_bits, cbit_digit, table_limit, cd);
         start_bits[c] |= cd->cbits[c+cbit_digit];  
789        try_next = FALSE;        try_next = FALSE;
790        break;        break;
791    
792        /* The cbit_space table has vertical tab as whitespace; we have to        /* The cbit_space table has vertical tab as whitespace; we have to
793        discard it. */        ensure it is set as not whitespace. */
794    
795        case OP_NOT_WHITESPACE:        case OP_NOT_WHITESPACE:
796        for (c = 0; c < 32; c++)        set_nottype_bits(start_bits, cbit_space, table_limit, cd);
797          {        start_bits[1] |= 0x08;
         int d = cd->cbits[c+cbit_space];  
         if (c == 1) d &= ~0x08;  
         start_bits[c] |= ~d;  
         }  
798        try_next = FALSE;        try_next = FALSE;
799        break;        break;
800    
801        /* The cbit_space table has vertical tab as whitespace; we have to        /* The cbit_space table has vertical tab as whitespace; we have to
802        discard it. */        not set it from the table. */
803    
804        case OP_WHITESPACE:        case OP_WHITESPACE:
805        for (c = 0; c < 32; c++)        c = start_bits[1];    /* Save in case it was already set */
806          {        set_type_bits(start_bits, cbit_space, table_limit, cd);
807          int d = cd->cbits[c+cbit_space];        start_bits[1] = (start_bits[1] & ~0x08) | c;
         if (c == 1) d &= ~0x08;  
         start_bits[c] |= d;  
         }  
808        try_next = FALSE;        try_next = FALSE;
809        break;        break;
810    
811        case OP_NOT_WORDCHAR:        case OP_NOT_WORDCHAR:
812        for (c = 0; c < 32; c++)        set_nottype_bits(start_bits, cbit_word, table_limit, cd);
         start_bits[c] |= ~cd->cbits[c+cbit_word];  
813        try_next = FALSE;        try_next = FALSE;
814        break;        break;
815    
816        case OP_WORDCHAR:        case OP_WORDCHAR:
817        for (c = 0; c < 32; c++)        set_type_bits(start_bits, cbit_word, table_limit, cd);
         start_bits[c] |= cd->cbits[c+cbit_word];  
818        try_next = FALSE;        try_next = FALSE;
819        break;        break;
820    
821        /* One or more character type fudges the pointer and restarts, knowing        /* One or more character type fudges the pointer and restarts, knowing
822        it will hit a single character type and stop there. */        it will hit a single character type and stop there. */
823    
824        case OP_TYPEPLUS:        case OP_TYPEPLUS:
825        case OP_TYPEMINPLUS:        case OP_TYPEMINPLUS:
826          case OP_TYPEPOSPLUS:
827        tcode++;        tcode++;
828        break;        break;
829    
# Line 706  do Line 847  do
847        case OP_TYPEPOSQUERY:        case OP_TYPEPOSQUERY:
848        switch(tcode[1])        switch(tcode[1])
849          {          {
850            default:
851          case OP_ANY:          case OP_ANY:
852          case OP_ALLANY:          case OP_ALLANY:
853          return SSB_FAIL;          return SSB_FAIL;
854    
855            case OP_HSPACE:
856            SET_BIT(0x09);
857            SET_BIT(0x20);
858            if (utf8)
859              {
860              SET_BIT(0xC2);  /* For U+00A0 */
861              SET_BIT(0xE1);  /* For U+1680, U+180E */
862              SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
863              SET_BIT(0xE3);  /* For U+3000 */
864              }
865            else SET_BIT(0xA0);
866            break;
867    
868            case OP_ANYNL:
869            case OP_VSPACE:
870            SET_BIT(0x0A);
871            SET_BIT(0x0B);
872            SET_BIT(0x0C);
873            SET_BIT(0x0D);
874            if (utf8)
875              {
876              SET_BIT(0xC2);  /* For U+0085 */
877              SET_BIT(0xE2);  /* For U+2028, U+2029 */
878              }
879            else SET_BIT(0x85);
880            break;
881    
882          case OP_NOT_DIGIT:          case OP_NOT_DIGIT:
883          for (c = 0; c < 32; c++)          set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
           start_bits[c] |= ~cd->cbits[c+cbit_digit];  
884          break;          break;
885    
886          case OP_DIGIT:          case OP_DIGIT:
887          for (c = 0; c < 32; c++)          set_type_bits(start_bits, cbit_digit, table_limit, cd);
           start_bits[c] |= cd->cbits[c+cbit_digit];  
888          break;          break;
889    
890          /* The cbit_space table has vertical tab as whitespace; we have to          /* The cbit_space table has vertical tab as whitespace; we have to
891          discard it. */          ensure it gets set as not whitespace. */
892    
893          case OP_NOT_WHITESPACE:          case OP_NOT_WHITESPACE:
894          for (c = 0; c < 32; c++)          set_nottype_bits(start_bits, cbit_space, table_limit, cd);
895            {          start_bits[1] |= 0x08;
           int d = cd->cbits[c+cbit_space];  
           if (c == 1) d &= ~0x08;  
           start_bits[c] |= ~d;  
           }  
896          break;          break;
897    
898          /* The cbit_space table has vertical tab as whitespace; we have to          /* The cbit_space table has vertical tab as whitespace; we have to
899          discard it. */          avoid setting it. */
900    
901          case OP_WHITESPACE:          case OP_WHITESPACE:
902          for (c = 0; c < 32; c++)          c = start_bits[1];    /* Save in case it was already set */
903            {          set_type_bits(start_bits, cbit_space, table_limit, cd);
904            int d = cd->cbits[c+cbit_space];          start_bits[1] = (start_bits[1] & ~0x08) | c;
           if (c == 1) d &= ~0x08;  
           start_bits[c] |= d;  
           }  
905          break;          break;
906    
907          case OP_NOT_WORDCHAR:          case OP_NOT_WORDCHAR:
908          for (c = 0; c < 32; c++)          set_nottype_bits(start_bits, cbit_word, table_limit, cd);
           start_bits[c] |= ~cd->cbits[c+cbit_word];  
909          break;          break;
910    
911          case OP_WORDCHAR:          case OP_WORDCHAR:
912          for (c = 0; c < 32; c++)          set_type_bits(start_bits, cbit_word, table_limit, cd);
           start_bits[c] |= cd->cbits[c+cbit_word];  
913          break;          break;
914          }          }
915    

Legend:
Removed from v.467  
changed lines
  Added in v.539

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12