/[pcre]/code/trunk/pcre_study.c
ViewVC logotype

Diff of /code/trunk/pcre_study.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 455 by ph10, Sat Sep 26 19:12:32 2009 UTC revision 545 by ph10, Wed Jun 16 10:51:15 2010 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2009 University of Cambridge             Copyright (c) 1997-2010 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 48  supporting functions. */ Line 48  supporting functions. */
48    
49  #include "pcre_internal.h"  #include "pcre_internal.h"
50    
51    #define SET_BIT(c) start_bits[c/8] |= (1 << (c&7))
52    
53  /* Returns from set_start_bits() */  /* Returns from set_start_bits() */
54    
# Line 60  enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE Line 61  enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE
61  *************************************************/  *************************************************/
62    
63  /* Scan a parenthesized group and compute the minimum length of subject that  /* Scan a parenthesized group and compute the minimum length of subject that
64  is needed to match it. This is a lower bound; it does not mean there is a  is needed to match it. This is a lower bound; it does not mean there is a
65  string of that length that matches. In UTF8 mode, the result is in characters  string of that length that matches. In UTF8 mode, the result is in characters
66  rather than bytes.  rather than bytes.
67    
68  Arguments:  Arguments:
69    code       pointer to start of group (the bracket)    code       pointer to start of group (the bracket)
70    startcode  pointer to start of the whole pattern    startcode  pointer to start of the whole pattern
71    options    the compiling options    options    the compiling options
72    
73  Returns:   the minimum length  Returns:   the minimum length
74             -1 if \C was encountered             -1 if \C was encountered
75             -2 internal error (missing capturing bracket)             -2 internal error (missing capturing bracket)
76  */  */
77    
78  static int  static int
# Line 91  branch, check the length against that of Line 92  branch, check the length against that of
92  for (;;)  for (;;)
93    {    {
94    int d, min;    int d, min;
95    uschar *cs, *ce;    uschar *cs, *ce;
96    register int op = *cc;    register int op = *cc;
97    
98    switch (op)    switch (op)
99      {      {
100        case OP_COND:
101        case OP_SCOND:
102    
103        /* If there is only one branch in a condition, the implied branch has zero
104        length, so we don't add anything. This covers the DEFINE "condition"
105        automatically. */
106    
107        cs = cc + GET(cc, 1);
108        if (*cs != OP_ALT)
109          {
110          cc = cs + 1 + LINK_SIZE;
111          break;
112          }
113    
114        /* Otherwise we can fall through and treat it the same as any other
115        subpattern. */
116    
117      case OP_CBRA:      case OP_CBRA:
118      case OP_SCBRA:      case OP_SCBRA:
119      case OP_BRA:      case OP_BRA:
120      case OP_SBRA:      case OP_SBRA:
121      case OP_ONCE:      case OP_ONCE:
     case OP_COND:  
     case OP_SCOND:  
122      d = find_minlength(cc, startcode, options);      d = find_minlength(cc, startcode, options);
123      if (d < 0) return d;      if (d < 0) return d;
124      branchlength += d;      branchlength += d;
# Line 119  for (;;) Line 135  for (;;)
135      case OP_KETRMAX:      case OP_KETRMAX:
136      case OP_KETRMIN:      case OP_KETRMIN:
137      case OP_END:      case OP_END:
138      if (length < 0 || (!had_recurse && branchlength < length))      if (length < 0 || (!had_recurse && branchlength < length))
139        length = branchlength;        length = branchlength;
140      if (*cc != OP_ALT) return length;      if (*cc != OP_ALT) return length;
141      cc += 1 + LINK_SIZE;      cc += 1 + LINK_SIZE;
142      branchlength = 0;      branchlength = 0;
143      had_recurse = FALSE;      had_recurse = FALSE;
144      break;      break;
145    
146      /* Skip over assertive subpatterns */      /* Skip over assertive subpatterns */
# Line 140  for (;;) Line 156  for (;;)
156    
157      case OP_REVERSE:      case OP_REVERSE:
158      case OP_CREF:      case OP_CREF:
159        case OP_NCREF:
160      case OP_RREF:      case OP_RREF:
161        case OP_NRREF:
162      case OP_DEF:      case OP_DEF:
163      case OP_OPT:      case OP_OPT:
164      case OP_CALLOUT:      case OP_CALLOUT:
# Line 154  for (;;) Line 172  for (;;)
172      case OP_WORD_BOUNDARY:      case OP_WORD_BOUNDARY:
173      cc += _pcre_OP_lengths[*cc];      cc += _pcre_OP_lengths[*cc];
174      break;      break;
175    
176      /* Skip over a subpattern that has a {0} or {0,x} quantifier */      /* Skip over a subpattern that has a {0} or {0,x} quantifier */
177    
178      case OP_BRAZERO:      case OP_BRAZERO:
179      case OP_BRAMINZERO:      case OP_BRAMINZERO:
180      case OP_SKIPZERO:      case OP_SKIPZERO:
181      cc += _pcre_OP_lengths[*cc];      cc += _pcre_OP_lengths[*cc];
182      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
# Line 182  for (;;) Line 200  for (;;)
200      if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];      if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
201  #endif  #endif
202      break;      break;
203    
204      case OP_TYPEPLUS:      case OP_TYPEPLUS:
205      case OP_TYPEMINPLUS:      case OP_TYPEMINPLUS:
206      case OP_TYPEPOSPLUS:      case OP_TYPEPOSPLUS:
207      branchlength++;      branchlength++;
208      cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;      cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
209      break;      break;
# Line 194  for (;;) Line 212  for (;;)
212      need to skip over a multibyte character in UTF8 mode.  */      need to skip over a multibyte character in UTF8 mode.  */
213    
214      case OP_EXACT:      case OP_EXACT:
215      case OP_NOTEXACT:      case OP_NOTEXACT:
216      branchlength += GET2(cc,1);      branchlength += GET2(cc,1);
217      cc += 4;      cc += 4;
218  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
# Line 223  for (;;) Line 241  for (;;)
241      case OP_ANY:      case OP_ANY:
242      case OP_ALLANY:      case OP_ALLANY:
243      case OP_EXTUNI:      case OP_EXTUNI:
244      case OP_HSPACE:      case OP_HSPACE:
245      case OP_NOT_HSPACE:      case OP_NOT_HSPACE:
246      case OP_VSPACE:      case OP_VSPACE:
247      case OP_NOT_VSPACE:      case OP_NOT_VSPACE:
248      branchlength++;      branchlength++;
249      cc++;      cc++;
250      break;      break;
251    
252      /* "Any newline" might match two characters */      /* "Any newline" might match two characters */
253    
254      case OP_ANYNL:      case OP_ANYNL:
255      branchlength += 2;      branchlength += 2;
256      cc++;      cc++;
257      break;      break;
258    
259      /* The single-byte matcher means we can't proceed in UTF-8 mode */      /* The single-byte matcher means we can't proceed in UTF-8 mode */
260    
# Line 246  for (;;) Line 264  for (;;)
264  #endif  #endif
265      branchlength++;      branchlength++;
266      cc++;      cc++;
267      break;      break;
268    
269      /* For repeated character types, we have to test for \p and \P, which have      /* For repeated character types, we have to test for \p and \P, which have
270      an extra two bytes of parameters. */      an extra two bytes of parameters. */
# Line 285  for (;;) Line 303  for (;;)
303        case OP_CRPLUS:        case OP_CRPLUS:
304        case OP_CRMINPLUS:        case OP_CRMINPLUS:
305        branchlength++;        branchlength++;
306        /* Fall through */        /* Fall through */
307    
308        case OP_CRSTAR:        case OP_CRSTAR:
309        case OP_CRMINSTAR:        case OP_CRMINSTAR:
310        case OP_CRQUERY:        case OP_CRQUERY:
311        case OP_CRMINQUERY:        case OP_CRMINQUERY:
312        cc++;        cc++;
313        break;        break;
314    
315        case OP_CRRANGE:        case OP_CRRANGE:
316        case OP_CRMINRANGE:        case OP_CRMINRANGE:
317        branchlength += GET2(cc,1);        branchlength += GET2(cc,1);
318        cc += 5;        cc += 5;
319        break;        break;
320    
321        default:        default:
322        branchlength++;        branchlength++;
323        break;        break;
324        }        }
325      break;      break;
326    
327      /* Backreferences and subroutine calls are treated in the same way: we find      /* Backreferences and subroutine calls are treated in the same way: we find
328      the minimum length for the subpattern. A recursion, however, causes      the minimum length for the subpattern. A recursion, however, causes
329      a flag to be set that causes the length of this branch to be ignored. The      a flag to be set that causes the length of this branch to be ignored. The
330      logic is that a recursion can only make sense if there is another      logic is that a recursion can only make sense if there is another
331      alternation that stops the recursing. That will provide the minimum length      alternation that stops the recursing. That will provide the minimum length
332      (when no recursion happens). A backreference within the group that it is      (when no recursion happens). A backreference within the group that it is
333      referencing behaves in the same way. */      referencing behaves in the same way.
334    
335        If PCRE_JAVASCRIPT_COMPAT is set, a backreference to an unset bracket
336        matches an empty string (by default it causes a matching failure), so in
337        that case we must set the minimum length to zero. */
338    
339      case OP_REF:      case OP_REF:
340      ce = cs = (uschar *)_pcre_find_bracket(startcode, utf8, GET2(cc, 1));      if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
     if (cs == NULL) return -2;  
     do ce += GET(ce, 1); while (*ce == OP_ALT);  
     if (cc > cs && cc < ce)  
341        {        {
342        d = 0;        ce = cs = (uschar *)_pcre_find_bracket(startcode, utf8, GET2(cc, 1));
343        had_recurse = TRUE;        if (cs == NULL) return -2;
344        }        do ce += GET(ce, 1); while (*ce == OP_ALT);
345      else d = find_minlength(cs, startcode, options);        if (cc > cs && cc < ce)
346      cc += 3;          {
347            d = 0;
348            had_recurse = TRUE;
349            }
350          else d = find_minlength(cs, startcode, options);
351          }
352        else d = 0;
353        cc += 3;
354    
355      /* Handle repeated back references */      /* Handle repeated back references */
356    
357      switch (*cc)      switch (*cc)
358        {        {
359        case OP_CRSTAR:        case OP_CRSTAR:
# Line 337  for (;;) Line 363  for (;;)
363        min = 0;        min = 0;
364        cc++;        cc++;
365        break;        break;
366    
367        case OP_CRRANGE:        case OP_CRRANGE:
368        case OP_CRMINRANGE:        case OP_CRMINRANGE:
369        min = GET2(cc, 1);        min = GET2(cc, 1);
370        cc += 5;        cc += 5;
371        break;        break;
372    
373        default:        default:
374        min = 1;        min = 1;
375        break;        break;
376        }        }
377    
378      branchlength += min * d;      branchlength += min * d;
379      break;      break;
380    
381      case OP_RECURSE:      case OP_RECURSE:
382      cs = ce = (uschar *)startcode + GET(cc, 1);      cs = ce = (uschar *)startcode + GET(cc, 1);
383      if (cs == NULL) return -2;      if (cs == NULL) return -2;
384      do ce += GET(ce, 1); while (*ce == OP_ALT);      do ce += GET(ce, 1); while (*ce == OP_ALT);
385      if (cc > cs && cc < ce)      if (cc > cs && cc < ce)
386        had_recurse = TRUE;        had_recurse = TRUE;
387      else      else
388        branchlength += find_minlength(cs, startcode, options);        branchlength += find_minlength(cs, startcode, options);
389      cc += 1 + LINK_SIZE;      cc += 1 + LINK_SIZE;
390      break;      break;
391    
392      /* Anything else does not or need not match a character. We can get the      /* Anything else does not or need not match a character. We can get the
393      item's length from the table, but for those that can match zero occurrences      item's length from the table, but for those that can match zero occurrences
394      of a character, we must take special action for UTF-8 characters. */      of a character, we must take special action for UTF-8 characters. */
395    
396      case OP_UPTO:      case OP_UPTO:
397      case OP_NOTUPTO:      case OP_NOTUPTO:
398      case OP_MINUPTO:      case OP_MINUPTO:
399      case OP_NOTMINUPTO:      case OP_NOTMINUPTO:
400      case OP_POSUPTO:      case OP_POSUPTO:
401      case OP_STAR:      case OP_STAR:
402      case OP_MINSTAR:      case OP_MINSTAR:
403      case OP_NOTMINSTAR:      case OP_NOTMINSTAR:
404      case OP_POSSTAR:      case OP_POSSTAR:
405      case OP_NOTPOSSTAR:      case OP_NOTPOSSTAR:
406      case OP_QUERY:      case OP_QUERY:
407      case OP_MINQUERY:      case OP_MINQUERY:
408      case OP_NOTMINQUERY:      case OP_NOTMINQUERY:
409      case OP_POSQUERY:      case OP_POSQUERY:
410      case OP_NOTPOSQUERY:      case OP_NOTPOSQUERY:
411      cc += _pcre_OP_lengths[op];      cc += _pcre_OP_lengths[op];
412  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
413      if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];      if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
414  #endif  #endif
415        break;
416    
417        /* Skip these, but we need to add in the name length. */
418    
419        case OP_MARK:
420        case OP_PRUNE_ARG:
421        case OP_SKIP_ARG:
422        case OP_THEN_ARG:
423        cc += _pcre_OP_lengths[op] + cc[1];
424      break;      break;
425    
426      /* For the record, these are the opcodes that are matched by "default":      /* For the record, these are the opcodes that are matched by "default":
427      OP_ACCEPT, OP_CLOSE, OP_COMMIT, OP_FAIL, OP_PRUNE, OP_SET_SOM, OP_SKIP,      OP_ACCEPT, OP_CLOSE, OP_COMMIT, OP_FAIL, OP_PRUNE, OP_SET_SOM, OP_SKIP,
428      OP_THEN. */      OP_THEN. */
429    
430      default:      default:
431      cc += _pcre_OP_lengths[op];      cc += _pcre_OP_lengths[op];
432      break;      break;
# Line 406  for (;;) Line 441  for (;;)
441  *      Set a bit and maybe its alternate case    *  *      Set a bit and maybe its alternate case    *
442  *************************************************/  *************************************************/
443    
444  /* Given a character, set its bit in the table, and also the bit for the other  /* Given a character, set its first byte's bit in the table, and also the
445  version of a letter if we are caseless.  corresponding bit for the other version of a letter if we are caseless. In
446    UTF-8 mode, for characters greater than 127, we can only do the caseless thing
447    when Unicode property support is available.
448    
449  Arguments:  Arguments:
450    start_bits    points to the bit map    start_bits    points to the bit map
451    c             is the character    p             points to the character
452    caseless      the caseless flag    caseless      the caseless flag
453    cd            the block with char table pointers    cd            the block with char table pointers
454      utf8          TRUE for UTF-8 mode
455    
456    Returns:        pointer after the character
457    */
458    
459    static const uschar *
460    set_table_bit(uschar *start_bits, const uschar *p, BOOL caseless,
461      compile_data *cd, BOOL utf8)
462    {
463    unsigned int c = *p;
464    
465    SET_BIT(c);
466    
467    #ifdef SUPPORT_UTF8
468    if (utf8 && c > 127)
469      {
470      GETCHARINC(c, p);
471    #ifdef SUPPORT_UCP
472      if (caseless)
473        {
474        uschar buff[8];
475        c = UCD_OTHERCASE(c);
476        (void)_pcre_ord2utf8(c, buff);
477        SET_BIT(buff[0]);
478        }
479    #endif
480      return p;
481      }
482    #endif
483    
484    /* Not UTF-8 mode, or character is less than 128. */
485    
486    if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
487    return p + 1;
488    }
489    
490    
491    
492    /*************************************************
493    *     Set bits for a positive character type     *
494    *************************************************/
495    
496    /* This function sets starting bits for a character type. In UTF-8 mode, we can
497    only do a direct setting for bytes less than 128, as otherwise there can be
498    confusion with bytes in the middle of UTF-8 characters. In a "traditional"
499    environment, the tables will only recognize ASCII characters anyway, but in at
500    least one Windows environment, some higher bytes had bits set in the tables.
501    So we deal with that case by considering the UTF-8 encoding.
502    
503  Returns:        nothing  Arguments:
504      start_bits     the starting bitmap
505      cbit_type      the type of character wanted
506      table_limit    32 for non-UTF-8; 16 for UTF-8
507      cd             the block with char table pointers
508    
509    Returns:         nothing
510    */
511    
512    static void
513    set_type_bits(uschar *start_bits, int cbit_type, int table_limit,
514      compile_data *cd)
515    {
516    register int c;
517    for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];
518    if (table_limit == 32) return;
519    for (c = 128; c < 256; c++)
520      {
521      if ((cd->cbits[c/8] & (1 << (c&7))) != 0)
522        {
523        uschar buff[8];
524        (void)_pcre_ord2utf8(c, buff);
525        SET_BIT(buff[0]);
526        }
527      }
528    }
529    
530    
531    /*************************************************
532    *     Set bits for a negative character type     *
533    *************************************************/
534    
535    /* This function sets starting bits for a negative character type such as \D.
536    In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
537    otherwise there can be confusion with bytes in the middle of UTF-8 characters.
538    Unlike in the positive case, where we can set appropriate starting bits for
539    specific high-valued UTF-8 characters, in this case we have to set the bits for
540    all high-valued characters. The lowest is 0xc2, but we overkill by starting at
541    0xc0 (192) for simplicity.
542    
543    Arguments:
544      start_bits     the starting bitmap
545      cbit_type      the type of character wanted
546      table_limit    32 for non-UTF-8; 16 for UTF-8
547      cd             the block with char table pointers
548    
549    Returns:         nothing
550  */  */
551    
552  static void  static void
553  set_bit(uschar *start_bits, unsigned int c, BOOL caseless, compile_data *cd)  set_nottype_bits(uschar *start_bits, int cbit_type, int table_limit,
554      compile_data *cd)
555  {  {
556  start_bits[c/8] |= (1 << (c&7));  register int c;
557  if (caseless && (cd->ctypes[c] & ctype_letter) != 0)  for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type];
558    start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7));  if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;
559  }  }
560    
561    
# Line 458  set_start_bits(const uschar *code, uscha Line 590  set_start_bits(const uschar *code, uscha
590  {  {
591  register int c;  register int c;
592  int yield = SSB_DONE;  int yield = SSB_DONE;
593    int table_limit = utf8? 16:32;
594    
595  #if 0  #if 0
596  /* ========================================================================= */  /* ========================================================================= */
# Line 581  do Line 714  do
714        case OP_QUERY:        case OP_QUERY:
715        case OP_MINQUERY:        case OP_MINQUERY:
716        case OP_POSQUERY:        case OP_POSQUERY:
717        set_bit(start_bits, tcode[1], caseless, cd);        tcode = set_table_bit(start_bits, tcode + 1, caseless, cd, utf8);
       tcode += 2;  
 #ifdef SUPPORT_UTF8  
       if (utf8 && tcode[-1] >= 0xc0)  
         tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];  
 #endif  
718        break;        break;
719    
720        /* Single-char upto sets the bit and tries the next */        /* Single-char upto sets the bit and tries the next */
# Line 594  do Line 722  do
722        case OP_UPTO:        case OP_UPTO:
723        case OP_MINUPTO:        case OP_MINUPTO:
724        case OP_POSUPTO:        case OP_POSUPTO:
725        set_bit(start_bits, tcode[3], caseless, cd);        tcode = set_table_bit(start_bits, tcode + 3, caseless, cd, utf8);
       tcode += 4;  
 #ifdef SUPPORT_UTF8  
       if (utf8 && tcode[-1] >= 0xc0)  
         tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];  
 #endif  
726        break;        break;
727    
728        /* At least one single char sets the bit and stops */        /* At least one single char sets the bit and stops */
# Line 612  do Line 735  do
735        case OP_PLUS:        case OP_PLUS:
736        case OP_MINPLUS:        case OP_MINPLUS:
737        case OP_POSPLUS:        case OP_POSPLUS:
738        set_bit(start_bits, tcode[1], caseless, cd);        (void)set_table_bit(start_bits, tcode + 1, caseless, cd, utf8);
739        try_next = FALSE;        try_next = FALSE;
740        break;        break;
741    
742        /* Single character type sets the bits and stops */        /* Special spacing and line-terminating items. These recognize specific
743          lists of characters. The difference between VSPACE and ANYNL is that the
744          latter can match the two-character CRLF sequence, but that is not
745          relevant for finding the first character, so their code here is
746          identical. */
747    
748          case OP_HSPACE:
749          SET_BIT(0x09);
750          SET_BIT(0x20);
751          if (utf8)
752            {
753            SET_BIT(0xC2);  /* For U+00A0 */
754            SET_BIT(0xE1);  /* For U+1680, U+180E */
755            SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
756            SET_BIT(0xE3);  /* For U+3000 */
757            }
758          else SET_BIT(0xA0);
759          try_next = FALSE;
760          break;
761    
762          case OP_ANYNL:
763          case OP_VSPACE:
764          SET_BIT(0x0A);
765          SET_BIT(0x0B);
766          SET_BIT(0x0C);
767          SET_BIT(0x0D);
768          if (utf8)
769            {
770            SET_BIT(0xC2);  /* For U+0085 */
771            SET_BIT(0xE2);  /* For U+2028, U+2029 */
772            }
773          else SET_BIT(0x85);
774          try_next = FALSE;
775          break;
776    
777          /* Single character types set the bits and stop. Note that if PCRE_UCP
778          is set, we do not see these op codes because \d etc are converted to
779          properties. Therefore, these apply in the case when only characters less
780          than 256 are recognized to match the types. */
781    
782        case OP_NOT_DIGIT:        case OP_NOT_DIGIT:
783        for (c = 0; c < 32; c++)        set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
         start_bits[c] |= ~cd->cbits[c+cbit_digit];  
784        try_next = FALSE;        try_next = FALSE;
785        break;        break;
786    
787        case OP_DIGIT:        case OP_DIGIT:
788        for (c = 0; c < 32; c++)        set_type_bits(start_bits, cbit_digit, table_limit, cd);
         start_bits[c] |= cd->cbits[c+cbit_digit];  
789        try_next = FALSE;        try_next = FALSE;
790        break;        break;
791    
792        /* The cbit_space table has vertical tab as whitespace; we have to        /* The cbit_space table has vertical tab as whitespace; we have to
793        discard it. */        ensure it is set as not whitespace. */
794    
795        case OP_NOT_WHITESPACE:        case OP_NOT_WHITESPACE:
796        for (c = 0; c < 32; c++)        set_nottype_bits(start_bits, cbit_space, table_limit, cd);
797          {        start_bits[1] |= 0x08;
         int d = cd->cbits[c+cbit_space];  
         if (c == 1) d &= ~0x08;  
         start_bits[c] |= ~d;  
         }  
798        try_next = FALSE;        try_next = FALSE;
799        break;        break;
800    
801        /* The cbit_space table has vertical tab as whitespace; we have to        /* The cbit_space table has vertical tab as whitespace; we have to
802        discard it. */        not set it from the table. */
803    
804        case OP_WHITESPACE:        case OP_WHITESPACE:
805        for (c = 0; c < 32; c++)        c = start_bits[1];    /* Save in case it was already set */
806          {        set_type_bits(start_bits, cbit_space, table_limit, cd);
807          int d = cd->cbits[c+cbit_space];        start_bits[1] = (start_bits[1] & ~0x08) | c;
         if (c == 1) d &= ~0x08;  
         start_bits[c] |= d;  
         }  
808        try_next = FALSE;        try_next = FALSE;
809        break;        break;
810    
811        case OP_NOT_WORDCHAR:        case OP_NOT_WORDCHAR:
812        for (c = 0; c < 32; c++)        set_nottype_bits(start_bits, cbit_word, table_limit, cd);
         start_bits[c] |= ~cd->cbits[c+cbit_word];  
813        try_next = FALSE;        try_next = FALSE;
814        break;        break;
815    
816        case OP_WORDCHAR:        case OP_WORDCHAR:
817        for (c = 0; c < 32; c++)        set_type_bits(start_bits, cbit_word, table_limit, cd);
         start_bits[c] |= cd->cbits[c+cbit_word];  
818        try_next = FALSE;        try_next = FALSE;
819        break;        break;
820    
# Line 673  do Line 823  do
823    
824        case OP_TYPEPLUS:        case OP_TYPEPLUS:
825        case OP_TYPEMINPLUS:        case OP_TYPEMINPLUS:
826          case OP_TYPEPOSPLUS:
827        tcode++;        tcode++;
828        break;        break;
829    
# Line 696  do Line 847  do
847        case OP_TYPEPOSQUERY:        case OP_TYPEPOSQUERY:
848        switch(tcode[1])        switch(tcode[1])
849          {          {
850            default:
851          case OP_ANY:          case OP_ANY:
852          case OP_ALLANY:          case OP_ALLANY:
853          return SSB_FAIL;          return SSB_FAIL;
854    
855            case OP_HSPACE:
856            SET_BIT(0x09);
857            SET_BIT(0x20);
858            if (utf8)
859              {
860              SET_BIT(0xC2);  /* For U+00A0 */
861              SET_BIT(0xE1);  /* For U+1680, U+180E */
862              SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
863              SET_BIT(0xE3);  /* For U+3000 */
864              }
865            else SET_BIT(0xA0);
866            break;
867    
868            case OP_ANYNL:
869            case OP_VSPACE:
870            SET_BIT(0x0A);
871            SET_BIT(0x0B);
872            SET_BIT(0x0C);
873            SET_BIT(0x0D);
874            if (utf8)
875              {
876              SET_BIT(0xC2);  /* For U+0085 */
877              SET_BIT(0xE2);  /* For U+2028, U+2029 */
878              }
879            else SET_BIT(0x85);
880            break;
881    
882          case OP_NOT_DIGIT:          case OP_NOT_DIGIT:
883          for (c = 0; c < 32; c++)          set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
           start_bits[c] |= ~cd->cbits[c+cbit_digit];  
884          break;          break;
885    
886          case OP_DIGIT:          case OP_DIGIT:
887          for (c = 0; c < 32; c++)          set_type_bits(start_bits, cbit_digit, table_limit, cd);
           start_bits[c] |= cd->cbits[c+cbit_digit];  
888          break;          break;
889    
890          /* The cbit_space table has vertical tab as whitespace; we have to          /* The cbit_space table has vertical tab as whitespace; we have to
891          discard it. */          ensure it gets set as not whitespace. */
892    
893          case OP_NOT_WHITESPACE:          case OP_NOT_WHITESPACE:
894          for (c = 0; c < 32; c++)          set_nottype_bits(start_bits, cbit_space, table_limit, cd);
895            {          start_bits[1] |= 0x08;
           int d = cd->cbits[c+cbit_space];  
           if (c == 1) d &= ~0x08;  
           start_bits[c] |= ~d;  
           }  
896          break;          break;
897    
898          /* The cbit_space table has vertical tab as whitespace; we have to          /* The cbit_space table has vertical tab as whitespace; we have to
899          discard it. */          avoid setting it. */
900    
901          case OP_WHITESPACE:          case OP_WHITESPACE:
902          for (c = 0; c < 32; c++)          c = start_bits[1];    /* Save in case it was already set */
903            {          set_type_bits(start_bits, cbit_space, table_limit, cd);
904            int d = cd->cbits[c+cbit_space];          start_bits[1] = (start_bits[1] & ~0x08) | c;
           if (c == 1) d &= ~0x08;  
           start_bits[c] |= d;  
           }  
905          break;          break;
906    
907          case OP_NOT_WORDCHAR:          case OP_NOT_WORDCHAR:
908          for (c = 0; c < 32; c++)          set_nottype_bits(start_bits, cbit_word, table_limit, cd);
           start_bits[c] |= ~cd->cbits[c+cbit_word];  
909          break;          break;
910    
911          case OP_WORDCHAR:          case OP_WORDCHAR:
912          for (c = 0; c < 32; c++)          set_type_bits(start_bits, cbit_word, table_limit, cd);
           start_bits[c] |= cd->cbits[c+cbit_word];  
913          break;          break;
914          }          }
915    
# Line 883  code = (uschar *)re + re->name_table_off Line 1051  code = (uschar *)re + re->name_table_off
1051    (re->name_count * re->name_entry_size);    (re->name_count * re->name_entry_size);
1052    
1053  /* For an anchored pattern, or an unanchored pattern that has a first char, or  /* For an anchored pattern, or an unanchored pattern that has a first char, or
1054  a multiline pattern that matches only at "line starts", there is no point in  a multiline pattern that matches only at "line starts", there is no point in
1055  seeking a list of starting bytes. */  seeking a list of starting bytes. */
1056    
1057  if ((re->options & PCRE_ANCHORED) == 0 &&  if ((re->options & PCRE_ANCHORED) == 0 &&
1058      (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) == 0)      (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) == 0)
1059    {    {
1060    /* Set the character tables in the block that is passed around */    /* Set the character tables in the block that is passed around */
1061    
1062    tables = re->tables;    tables = re->tables;
1063    if (tables == NULL)    if (tables == NULL)
1064      (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,      (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1065      (void *)(&tables));      (void *)(&tables));
1066    
1067    compile_block.lcc = tables + lcc_offset;    compile_block.lcc = tables + lcc_offset;
1068    compile_block.fcc = tables + fcc_offset;    compile_block.fcc = tables + fcc_offset;
1069    compile_block.cbits = tables + cbits_offset;    compile_block.cbits = tables + cbits_offset;
1070    compile_block.ctypes = tables + ctypes_offset;    compile_block.ctypes = tables + ctypes_offset;
1071    
1072    /* See if we can find a fixed set of initial characters for the pattern. */    /* See if we can find a fixed set of initial characters for the pattern. */
1073    
1074    memset(start_bits, 0, 32 * sizeof(uschar));    memset(start_bits, 0, 32 * sizeof(uschar));
1075    bits_set = set_start_bits(code, start_bits,    bits_set = set_start_bits(code, start_bits,
1076      (re->options & PCRE_CASELESS) != 0, (re->options & PCRE_UTF8) != 0,      (re->options & PCRE_CASELESS) != 0, (re->options & PCRE_UTF8) != 0,
1077      &compile_block) == SSB_DONE;      &compile_block) == SSB_DONE;
1078    }    }
1079    
1080  /* Find the minimum length of subject string. */  /* Find the minimum length of subject string. */
1081    
1082  min = find_minlength(code, code, re->options);  min = find_minlength(code, code, re->options);
# Line 945  if (bits_set) Line 1113  if (bits_set)
1113    study->flags |= PCRE_STUDY_MAPPED;    study->flags |= PCRE_STUDY_MAPPED;
1114    memcpy(study->start_bits, start_bits, sizeof(start_bits));    memcpy(study->start_bits, start_bits, sizeof(start_bits));
1115    }    }
1116    
1117  if (min >= 0)  if (min >= 0)
1118    {    {
1119    study->flags |= PCRE_STUDY_MINLEN;    study->flags |= PCRE_STUDY_MINLEN;
1120    study->minlength = min;    study->minlength = min;
1121    }    }
1122    
1123  return extra;  return extra;
1124  }  }

Legend:
Removed from v.455  
changed lines
  Added in v.545

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12