/[pcre]/code/trunk/pcre_study.c
ViewVC logotype

Diff of /code/trunk/pcre_study.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 459 by ph10, Sun Oct 4 09:21:39 2009 UTC revision 469 by ph10, Mon Oct 19 14:38:48 2009 UTC
# Line 60  enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE Line 60  enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE
60  *************************************************/  *************************************************/
61    
62  /* Scan a parenthesized group and compute the minimum length of subject that  /* Scan a parenthesized group and compute the minimum length of subject that
63  is needed to match it. This is a lower bound; it does not mean there is a  is needed to match it. This is a lower bound; it does not mean there is a
64  string of that length that matches. In UTF8 mode, the result is in characters  string of that length that matches. In UTF8 mode, the result is in characters
65  rather than bytes.  rather than bytes.
66    
67  Arguments:  Arguments:
68    code       pointer to start of group (the bracket)    code       pointer to start of group (the bracket)
69    startcode  pointer to start of the whole pattern    startcode  pointer to start of the whole pattern
70    options    the compiling options    options    the compiling options
71    
72  Returns:   the minimum length  Returns:   the minimum length
73             -1 if \C was encountered             -1 if \C was encountered
74             -2 internal error (missing capturing bracket)             -2 internal error (missing capturing bracket)
75  */  */
76    
77  static int  static int
# Line 91  branch, check the length against that of Line 91  branch, check the length against that of
91  for (;;)  for (;;)
92    {    {
93    int d, min;    int d, min;
94    uschar *cs, *ce;    uschar *cs, *ce;
95    register int op = *cc;    register int op = *cc;
96    
97    switch (op)    switch (op)
98      {      {
99      case OP_CBRA:      case OP_CBRA:
100      case OP_SCBRA:      case OP_SCBRA:
101      case OP_BRA:      case OP_BRA:
102      case OP_SBRA:      case OP_SBRA:
103      case OP_ONCE:      case OP_ONCE:
104      case OP_COND:      case OP_COND:
105      case OP_SCOND:      case OP_SCOND:
106      d = find_minlength(cc, startcode, options);      d = find_minlength(cc, startcode, options);
107      if (d < 0) return d;      if (d < 0) return d;
108      branchlength += d;      branchlength += d;
# Line 119  for (;;) Line 119  for (;;)
119      case OP_KETRMAX:      case OP_KETRMAX:
120      case OP_KETRMIN:      case OP_KETRMIN:
121      case OP_END:      case OP_END:
122      if (length < 0 || (!had_recurse && branchlength < length))      if (length < 0 || (!had_recurse && branchlength < length))
123        length = branchlength;        length = branchlength;
124      if (*cc != OP_ALT) return length;      if (*cc != OP_ALT) return length;
125      cc += 1 + LINK_SIZE;      cc += 1 + LINK_SIZE;
126      branchlength = 0;      branchlength = 0;
127      had_recurse = FALSE;      had_recurse = FALSE;
128      break;      break;
129    
130      /* Skip over assertive subpatterns */      /* Skip over assertive subpatterns */
# Line 156  for (;;) Line 156  for (;;)
156      case OP_WORD_BOUNDARY:      case OP_WORD_BOUNDARY:
157      cc += _pcre_OP_lengths[*cc];      cc += _pcre_OP_lengths[*cc];
158      break;      break;
159    
160      /* Skip over a subpattern that has a {0} or {0,x} quantifier */      /* Skip over a subpattern that has a {0} or {0,x} quantifier */
161    
162      case OP_BRAZERO:      case OP_BRAZERO:
163      case OP_BRAMINZERO:      case OP_BRAMINZERO:
164      case OP_SKIPZERO:      case OP_SKIPZERO:
165      cc += _pcre_OP_lengths[*cc];      cc += _pcre_OP_lengths[*cc];
166      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
# Line 184  for (;;) Line 184  for (;;)
184      if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];      if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
185  #endif  #endif
186      break;      break;
187    
188      case OP_TYPEPLUS:      case OP_TYPEPLUS:
189      case OP_TYPEMINPLUS:      case OP_TYPEMINPLUS:
190      case OP_TYPEPOSPLUS:      case OP_TYPEPOSPLUS:
191      branchlength++;      branchlength++;
192      cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;      cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
193      break;      break;
# Line 196  for (;;) Line 196  for (;;)
196      need to skip over a multibyte character in UTF8 mode.  */      need to skip over a multibyte character in UTF8 mode.  */
197    
198      case OP_EXACT:      case OP_EXACT:
199      case OP_NOTEXACT:      case OP_NOTEXACT:
200      branchlength += GET2(cc,1);      branchlength += GET2(cc,1);
201      cc += 4;      cc += 4;
202  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
# Line 225  for (;;) Line 225  for (;;)
225      case OP_ANY:      case OP_ANY:
226      case OP_ALLANY:      case OP_ALLANY:
227      case OP_EXTUNI:      case OP_EXTUNI:
228      case OP_HSPACE:      case OP_HSPACE:
229      case OP_NOT_HSPACE:      case OP_NOT_HSPACE:
230      case OP_VSPACE:      case OP_VSPACE:
231      case OP_NOT_VSPACE:      case OP_NOT_VSPACE:
232      branchlength++;      branchlength++;
233      cc++;      cc++;
234      break;      break;
235    
236      /* "Any newline" might match two characters */      /* "Any newline" might match two characters */
237    
238      case OP_ANYNL:      case OP_ANYNL:
239      branchlength += 2;      branchlength += 2;
240      cc++;      cc++;
241      break;      break;
242    
243      /* The single-byte matcher means we can't proceed in UTF-8 mode */      /* The single-byte matcher means we can't proceed in UTF-8 mode */
244    
# Line 248  for (;;) Line 248  for (;;)
248  #endif  #endif
249      branchlength++;      branchlength++;
250      cc++;      cc++;
251      break;      break;
252    
253      /* For repeated character types, we have to test for \p and \P, which have      /* For repeated character types, we have to test for \p and \P, which have
254      an extra two bytes of parameters. */      an extra two bytes of parameters. */
# Line 287  for (;;) Line 287  for (;;)
287        case OP_CRPLUS:        case OP_CRPLUS:
288        case OP_CRMINPLUS:        case OP_CRMINPLUS:
289        branchlength++;        branchlength++;
290        /* Fall through */        /* Fall through */
291    
292        case OP_CRSTAR:        case OP_CRSTAR:
293        case OP_CRMINSTAR:        case OP_CRMINSTAR:
294        case OP_CRQUERY:        case OP_CRQUERY:
295        case OP_CRMINQUERY:        case OP_CRMINQUERY:
296        cc++;        cc++;
297        break;        break;
298    
299        case OP_CRRANGE:        case OP_CRRANGE:
300        case OP_CRMINRANGE:        case OP_CRMINRANGE:
301        branchlength += GET2(cc,1);        branchlength += GET2(cc,1);
302        cc += 5;        cc += 5;
303        break;        break;
304    
305        default:        default:
306        branchlength++;        branchlength++;
307        break;        break;
308        }        }
309      break;      break;
310    
311      /* Backreferences and subroutine calls are treated in the same way: we find      /* Backreferences and subroutine calls are treated in the same way: we find
312      the minimum length for the subpattern. A recursion, however, causes an      the minimum length for the subpattern. A recursion, however, causes an
313      a flag to be set that causes the length of this branch to be ignored. The      a flag to be set that causes the length of this branch to be ignored. The
314      logic is that a recursion can only make sense if there is another      logic is that a recursion can only make sense if there is another
315      alternation that stops the recursing. That will provide the minimum length      alternation that stops the recursing. That will provide the minimum length
316      (when no recursion happens). A backreference within the group that it is      (when no recursion happens). A backreference within the group that it is
317      referencing behaves in the same way. */      referencing behaves in the same way.
318    
319        If PCRE_JAVASCRIPT_COMPAT is set, a backreference to an unset bracket
320        matches an empty string (by default it causes a matching failure), so in
321        that case we must set the minimum length to zero. */
322    
323      case OP_REF:      case OP_REF:
324      ce = cs = (uschar *)_pcre_find_bracket(startcode, utf8, GET2(cc, 1));      if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
     if (cs == NULL) return -2;  
     do ce += GET(ce, 1); while (*ce == OP_ALT);  
     if (cc > cs && cc < ce)  
325        {        {
326        d = 0;        ce = cs = (uschar *)_pcre_find_bracket(startcode, utf8, GET2(cc, 1));
327        had_recurse = TRUE;        if (cs == NULL) return -2;
328        }        do ce += GET(ce, 1); while (*ce == OP_ALT);
329      else d = find_minlength(cs, startcode, options);        if (cc > cs && cc < ce)
330      cc += 3;          {
331            d = 0;
332            had_recurse = TRUE;
333            }
334          else d = find_minlength(cs, startcode, options);
335          }
336        else d = 0;
337        cc += 3;
338    
339      /* Handle repeated back references */      /* Handle repeated back references */
340    
341      switch (*cc)      switch (*cc)
342        {        {
343        case OP_CRSTAR:        case OP_CRSTAR:
# Line 339  for (;;) Line 347  for (;;)
347        min = 0;        min = 0;
348        cc++;        cc++;
349        break;        break;
350    
351        case OP_CRRANGE:        case OP_CRRANGE:
352        case OP_CRMINRANGE:        case OP_CRMINRANGE:
353        min = GET2(cc, 1);        min = GET2(cc, 1);
354        cc += 5;        cc += 5;
355        break;        break;
356    
357        default:        default:
358        min = 1;        min = 1;
359        break;        break;
360        }        }
361    
362      branchlength += min * d;      branchlength += min * d;
363      break;      break;
364    
365      case OP_RECURSE:      case OP_RECURSE:
366      cs = ce = (uschar *)startcode + GET(cc, 1);      cs = ce = (uschar *)startcode + GET(cc, 1);
367      if (cs == NULL) return -2;      if (cs == NULL) return -2;
368      do ce += GET(ce, 1); while (*ce == OP_ALT);      do ce += GET(ce, 1); while (*ce == OP_ALT);
369      if (cc > cs && cc < ce)      if (cc > cs && cc < ce)
370        had_recurse = TRUE;        had_recurse = TRUE;
371      else      else
372        branchlength += find_minlength(cs, startcode, options);        branchlength += find_minlength(cs, startcode, options);
373      cc += 1 + LINK_SIZE;      cc += 1 + LINK_SIZE;
374      break;      break;
375    
376      /* Anything else does not or need not match a character. We can get the      /* Anything else does not or need not match a character. We can get the
377      item's length from the table, but for those that can match zero occurrences      item's length from the table, but for those that can match zero occurrences
378      of a character, we must take special action for UTF-8 characters. */      of a character, we must take special action for UTF-8 characters. */
379    
380      case OP_UPTO:      case OP_UPTO:
381      case OP_NOTUPTO:      case OP_NOTUPTO:
382      case OP_MINUPTO:      case OP_MINUPTO:
383      case OP_NOTMINUPTO:      case OP_NOTMINUPTO:
384      case OP_POSUPTO:      case OP_POSUPTO:
385      case OP_STAR:      case OP_STAR:
386      case OP_MINSTAR:      case OP_MINSTAR:
387      case OP_NOTMINSTAR:      case OP_NOTMINSTAR:
388      case OP_POSSTAR:      case OP_POSSTAR:
389      case OP_NOTPOSSTAR:      case OP_NOTPOSSTAR:
390      case OP_QUERY:      case OP_QUERY:
391      case OP_MINQUERY:      case OP_MINQUERY:
392      case OP_NOTMINQUERY:      case OP_NOTMINQUERY:
393      case OP_POSQUERY:      case OP_POSQUERY:
394      case OP_NOTPOSQUERY:      case OP_NOTPOSQUERY:
395      cc += _pcre_OP_lengths[op];      cc += _pcre_OP_lengths[op];
396  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
397      if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];      if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
398  #endif  #endif
399      break;      break;
400    
401      /* For the record, these are the opcodes that are matched by "default":      /* For the record, these are the opcodes that are matched by "default":
402      OP_ACCEPT, OP_CLOSE, OP_COMMIT, OP_FAIL, OP_PRUNE, OP_SET_SOM, OP_SKIP,      OP_ACCEPT, OP_CLOSE, OP_COMMIT, OP_FAIL, OP_PRUNE, OP_SET_SOM, OP_SKIP,
403      OP_THEN. */      OP_THEN. */
404    
405      default:      default:
406      cc += _pcre_OP_lengths[op];      cc += _pcre_OP_lengths[op];
407      break;      break;
# Line 885  code = (uschar *)re + re->name_table_off Line 893  code = (uschar *)re + re->name_table_off
893    (re->name_count * re->name_entry_size);    (re->name_count * re->name_entry_size);
894    
895  /* For an anchored pattern, or an unanchored pattern that has a first char, or  /* For an anchored pattern, or an unanchored pattern that has a first char, or
896  a multiline pattern that matches only at "line starts", there is no point in  a multiline pattern that matches only at "line starts", there is no point in
897  seeking a list of starting bytes. */  seeking a list of starting bytes. */
898    
899  if ((re->options & PCRE_ANCHORED) == 0 &&  if ((re->options & PCRE_ANCHORED) == 0 &&
900      (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) == 0)      (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) == 0)
901    {    {
902    /* Set the character tables in the block that is passed around */    /* Set the character tables in the block that is passed around */
903    
904    tables = re->tables;    tables = re->tables;
905    if (tables == NULL)    if (tables == NULL)
906      (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,      (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
907      (void *)(&tables));      (void *)(&tables));
908    
909    compile_block.lcc = tables + lcc_offset;    compile_block.lcc = tables + lcc_offset;
910    compile_block.fcc = tables + fcc_offset;    compile_block.fcc = tables + fcc_offset;
911    compile_block.cbits = tables + cbits_offset;    compile_block.cbits = tables + cbits_offset;
912    compile_block.ctypes = tables + ctypes_offset;    compile_block.ctypes = tables + ctypes_offset;
913    
914    /* See if we can find a fixed set of initial characters for the pattern. */    /* See if we can find a fixed set of initial characters for the pattern. */
915    
916    memset(start_bits, 0, 32 * sizeof(uschar));    memset(start_bits, 0, 32 * sizeof(uschar));
917    bits_set = set_start_bits(code, start_bits,    bits_set = set_start_bits(code, start_bits,
918      (re->options & PCRE_CASELESS) != 0, (re->options & PCRE_UTF8) != 0,      (re->options & PCRE_CASELESS) != 0, (re->options & PCRE_UTF8) != 0,
919      &compile_block) == SSB_DONE;      &compile_block) == SSB_DONE;
920    }    }
921    
922  /* Find the minimum length of subject string. */  /* Find the minimum length of subject string. */
923    
924  min = find_minlength(code, code, re->options);  min = find_minlength(code, code, re->options);
# Line 947  if (bits_set) Line 955  if (bits_set)
955    study->flags |= PCRE_STUDY_MAPPED;    study->flags |= PCRE_STUDY_MAPPED;
956    memcpy(study->start_bits, start_bits, sizeof(start_bits));    memcpy(study->start_bits, start_bits, sizeof(start_bits));
957    }    }
958    
959  if (min >= 0)  if (min >= 0)
960    {    {
961    study->flags |= PCRE_STUDY_MINLEN;    study->flags |= PCRE_STUDY_MINLEN;
962    study->minlength = min;    study->minlength = min;
963    }    }
964    
965  return extra;  return extra;
966  }  }

Legend:
Removed from v.459  
changed lines
  Added in v.469

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12