/[pcre]/code/trunk/pcre_study.c
ViewVC logotype

Diff of /code/trunk/pcre_study.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 460 by ph10, Sun Oct 4 09:21:39 2009 UTC revision 461 by ph10, Mon Oct 5 10:59:35 2009 UTC
# Line 60  enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE Line 60  enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE
60  *************************************************/  *************************************************/
61    
62  /* Scan a parenthesized group and compute the minimum length of subject that  /* Scan a parenthesized group and compute the minimum length of subject that
63  is needed to match it. This is a lower bound; it does not mean there is a  is needed to match it. This is a lower bound; it does not mean there is a
64  string of that length that matches. In UTF8 mode, the result is in characters  string of that length that matches. In UTF8 mode, the result is in characters
65  rather than bytes.  rather than bytes.
66    
67  Arguments:  Arguments:
68    code       pointer to start of group (the bracket)    code       pointer to start of group (the bracket)
69    startcode  pointer to start of the whole pattern    startcode  pointer to start of the whole pattern
70    options    the compiling options    options    the compiling options
71    
72  Returns:   the minimum length  Returns:   the minimum length
73             -1 if \C was encountered             -1 if \C was encountered
74             -2 internal error (missing capturing bracket)             -2 internal error (missing capturing bracket)
75  */  */
76    
77  static int  static int
# Line 91  branch, check the length against that of Line 91  branch, check the length against that of
91  for (;;)  for (;;)
92    {    {
93    int d, min;    int d, min;
94    uschar *cs, *ce;    uschar *cs, *ce;
95    register int op = *cc;    register int op = *cc;
96    
97    switch (op)    switch (op)
98      {      {
99      case OP_CBRA:      case OP_CBRA:
100      case OP_SCBRA:      case OP_SCBRA:
101      case OP_BRA:      case OP_BRA:
102      case OP_SBRA:      case OP_SBRA:
103      case OP_ONCE:      case OP_ONCE:
104      case OP_COND:      case OP_COND:
105      case OP_SCOND:      case OP_SCOND:
106      d = find_minlength(cc, startcode, options);      d = find_minlength(cc, startcode, options);
107      if (d < 0) return d;      if (d < 0) return d;
108      branchlength += d;      branchlength += d;
# Line 119  for (;;) Line 119  for (;;)
119      case OP_KETRMAX:      case OP_KETRMAX:
120      case OP_KETRMIN:      case OP_KETRMIN:
121      case OP_END:      case OP_END:
122      if (length < 0 || (!had_recurse && branchlength < length))      if (length < 0 || (!had_recurse && branchlength < length))
123        length = branchlength;        length = branchlength;
124      if (*cc != OP_ALT) return length;      if (*cc != OP_ALT) return length;
125      cc += 1 + LINK_SIZE;      cc += 1 + LINK_SIZE;
126      branchlength = 0;      branchlength = 0;
127      had_recurse = FALSE;      had_recurse = FALSE;
128      break;      break;
129    
130      /* Skip over assertive subpatterns */      /* Skip over assertive subpatterns */
# Line 156  for (;;) Line 156  for (;;)
156      case OP_WORD_BOUNDARY:      case OP_WORD_BOUNDARY:
157      cc += _pcre_OP_lengths[*cc];      cc += _pcre_OP_lengths[*cc];
158      break;      break;
159    
160      /* Skip over a subpattern that has a {0} or {0,x} quantifier */      /* Skip over a subpattern that has a {0} or {0,x} quantifier */
161    
162      case OP_BRAZERO:      case OP_BRAZERO:
163      case OP_BRAMINZERO:      case OP_BRAMINZERO:
164      case OP_SKIPZERO:      case OP_SKIPZERO:
165      cc += _pcre_OP_lengths[*cc];      cc += _pcre_OP_lengths[*cc];
166      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
# Line 184  for (;;) Line 184  for (;;)
184      if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];      if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
185  #endif  #endif
186      break;      break;
187    
188      case OP_TYPEPLUS:      case OP_TYPEPLUS:
189      case OP_TYPEMINPLUS:      case OP_TYPEMINPLUS:
190      case OP_TYPEPOSPLUS:      case OP_TYPEPOSPLUS:
191      branchlength++;      branchlength++;
192      cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;      cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
193      break;      break;
# Line 196  for (;;) Line 196  for (;;)
196      need to skip over a multibyte character in UTF8 mode.  */      need to skip over a multibyte character in UTF8 mode.  */
197    
198      case OP_EXACT:      case OP_EXACT:
199      case OP_NOTEXACT:      case OP_NOTEXACT:
200      branchlength += GET2(cc,1);      branchlength += GET2(cc,1);
201      cc += 4;      cc += 4;
202  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
# Line 225  for (;;) Line 225  for (;;)
225      case OP_ANY:      case OP_ANY:
226      case OP_ALLANY:      case OP_ALLANY:
227      case OP_EXTUNI:      case OP_EXTUNI:
228      case OP_HSPACE:      case OP_HSPACE:
229      case OP_NOT_HSPACE:      case OP_NOT_HSPACE:
230      case OP_VSPACE:      case OP_VSPACE:
231      case OP_NOT_VSPACE:      case OP_NOT_VSPACE:
232      branchlength++;      branchlength++;
233      cc++;      cc++;
234      break;      break;
235    
236      /* "Any newline" might match two characters */      /* "Any newline" might match two characters */
237    
238      case OP_ANYNL:      case OP_ANYNL:
239      branchlength += 2;      branchlength += 2;
240      cc++;      cc++;
241      break;      break;
242    
243      /* The single-byte matcher means we can't proceed in UTF-8 mode */      /* The single-byte matcher means we can't proceed in UTF-8 mode */
244    
# Line 248  for (;;) Line 248  for (;;)
248  #endif  #endif
249      branchlength++;      branchlength++;
250      cc++;      cc++;
251      break;      break;
252    
253      /* For repeated character types, we have to test for \p and \P, which have      /* For repeated character types, we have to test for \p and \P, which have
254      an extra two bytes of parameters. */      an extra two bytes of parameters. */
# Line 287  for (;;) Line 287  for (;;)
287        case OP_CRPLUS:        case OP_CRPLUS:
288        case OP_CRMINPLUS:        case OP_CRMINPLUS:
289        branchlength++;        branchlength++;
290        /* Fall through */        /* Fall through */
291    
292        case OP_CRSTAR:        case OP_CRSTAR:
293        case OP_CRMINSTAR:        case OP_CRMINSTAR:
294        case OP_CRQUERY:        case OP_CRQUERY:
295        case OP_CRMINQUERY:        case OP_CRMINQUERY:
296        cc++;        cc++;
297        break;        break;
298    
299        case OP_CRRANGE:        case OP_CRRANGE:
300        case OP_CRMINRANGE:        case OP_CRMINRANGE:
301        branchlength += GET2(cc,1);        branchlength += GET2(cc,1);
302        cc += 5;        cc += 5;
303        break;        break;
304    
305        default:        default:
306        branchlength++;        branchlength++;
307        break;        break;
308        }        }
309      break;      break;
310    
311      /* Backreferences and subroutine calls are treated in the same way: we find      /* Backreferences and subroutine calls are treated in the same way: we find
312      the minimum length for the subpattern. A recursion, however, causes an      the minimum length for the subpattern. A recursion, however, causes an
313      a flag to be set that causes the length of this branch to be ignored. The      a flag to be set that causes the length of this branch to be ignored. The
314      logic is that a recursion can only make sense if there is another      logic is that a recursion can only make sense if there is another
315      alternation that stops the recursing. That will provide the minimum length      alternation that stops the recursing. That will provide the minimum length
316      (when no recursion happens). A backreference within the group that it is      (when no recursion happens). A backreference within the group that it is
317      referencing behaves in the same way. */      referencing behaves in the same way. */
318    
319      case OP_REF:      case OP_REF:
320      ce = cs = (uschar *)_pcre_find_bracket(startcode, utf8, GET2(cc, 1));      ce = cs = (uschar *)_pcre_find_bracket(startcode, utf8, GET2(cc, 1));
321      if (cs == NULL) return -2;      if (cs == NULL) return -2;
# Line 323  for (;;) Line 323  for (;;)
323      if (cc > cs && cc < ce)      if (cc > cs && cc < ce)
324        {        {
325        d = 0;        d = 0;
326        had_recurse = TRUE;        had_recurse = TRUE;
327        }        }
328      else d = find_minlength(cs, startcode, options);      else d = find_minlength(cs, startcode, options);
329      cc += 3;      cc += 3;
330    
331      /* Handle repeated back references */      /* Handle repeated back references */
332    
333      switch (*cc)      switch (*cc)
334        {        {
335        case OP_CRSTAR:        case OP_CRSTAR:
# Line 339  for (;;) Line 339  for (;;)
339        min = 0;        min = 0;
340        cc++;        cc++;
341        break;        break;
342    
343        case OP_CRRANGE:        case OP_CRRANGE:
344        case OP_CRMINRANGE:        case OP_CRMINRANGE:
345        min = GET2(cc, 1);        min = GET2(cc, 1);
346        cc += 5;        cc += 5;
347        break;        break;
348    
349        default:        default:
350        min = 1;        min = 1;
351        break;        break;
352        }        }
353    
354      branchlength += min * d;      branchlength += min * d;
355      break;      break;
356    
357      case OP_RECURSE:      case OP_RECURSE:
358      cs = ce = (uschar *)startcode + GET(cc, 1);      cs = ce = (uschar *)startcode + GET(cc, 1);
359      if (cs == NULL) return -2;      if (cs == NULL) return -2;
360      do ce += GET(ce, 1); while (*ce == OP_ALT);      do ce += GET(ce, 1); while (*ce == OP_ALT);
361      if (cc > cs && cc < ce)      if (cc > cs && cc < ce)
362        had_recurse = TRUE;        had_recurse = TRUE;
363      else      else
364        branchlength += find_minlength(cs, startcode, options);        branchlength += find_minlength(cs, startcode, options);
365      cc += 1 + LINK_SIZE;      cc += 1 + LINK_SIZE;
366      break;      break;
367    
368      /* Anything else does not or need not match a character. We can get the      /* Anything else does not or need not match a character. We can get the
369      item's length from the table, but for those that can match zero occurrences      item's length from the table, but for those that can match zero occurrences
370      of a character, we must take special action for UTF-8 characters. */      of a character, we must take special action for UTF-8 characters. */
371    
372      case OP_UPTO:      case OP_UPTO:
373      case OP_NOTUPTO:      case OP_NOTUPTO:
374      case OP_MINUPTO:      case OP_MINUPTO:
375      case OP_NOTMINUPTO:      case OP_NOTMINUPTO:
376      case OP_POSUPTO:      case OP_POSUPTO:
377      case OP_STAR:      case OP_STAR:
378      case OP_MINSTAR:      case OP_MINSTAR:
379      case OP_NOTMINSTAR:      case OP_NOTMINSTAR:
380      case OP_POSSTAR:      case OP_POSSTAR:
381      case OP_NOTPOSSTAR:      case OP_NOTPOSSTAR:
382      case OP_QUERY:      case OP_QUERY:
383      case OP_MINQUERY:      case OP_MINQUERY:
384      case OP_NOTMINQUERY:      case OP_NOTMINQUERY:
385      case OP_POSQUERY:      case OP_POSQUERY:
386      case OP_NOTPOSQUERY:      case OP_NOTPOSQUERY:
387      cc += _pcre_OP_lengths[op];      cc += _pcre_OP_lengths[op];
388  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
389      if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];      if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
390  #endif  #endif
391      break;      break;
392    
393      /* For the record, these are the opcodes that are matched by "default":      /* For the record, these are the opcodes that are matched by "default":
394      OP_ACCEPT, OP_CLOSE, OP_COMMIT, OP_FAIL, OP_PRUNE, OP_SET_SOM, OP_SKIP,      OP_ACCEPT, OP_CLOSE, OP_COMMIT, OP_FAIL, OP_PRUNE, OP_SET_SOM, OP_SKIP,
395      OP_THEN. */      OP_THEN. */
396    
397      default:      default:
398      cc += _pcre_OP_lengths[op];      cc += _pcre_OP_lengths[op];
399      break;      break;
# Line 885  code = (uschar *)re + re->name_table_off Line 885  code = (uschar *)re + re->name_table_off
885    (re->name_count * re->name_entry_size);    (re->name_count * re->name_entry_size);
886    
887  /* For an anchored pattern, or an unanchored pattern that has a first char, or  /* For an anchored pattern, or an unanchored pattern that has a first char, or
888  a multiline pattern that matches only at "line starts", there is no point in  a multiline pattern that matches only at "line starts", there is no point in
889  seeking a list of starting bytes. */  seeking a list of starting bytes. */
890    
891  if ((re->options & PCRE_ANCHORED) == 0 &&  if ((re->options & PCRE_ANCHORED) == 0 &&
892      (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) == 0)      (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) == 0)
893    {    {
894    /* Set the character tables in the block that is passed around */    /* Set the character tables in the block that is passed around */
895    
896    tables = re->tables;    tables = re->tables;
897    if (tables == NULL)    if (tables == NULL)
898      (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,      (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
899      (void *)(&tables));      (void *)(&tables));
900    
901    compile_block.lcc = tables + lcc_offset;    compile_block.lcc = tables + lcc_offset;
902    compile_block.fcc = tables + fcc_offset;    compile_block.fcc = tables + fcc_offset;
903    compile_block.cbits = tables + cbits_offset;    compile_block.cbits = tables + cbits_offset;
904    compile_block.ctypes = tables + ctypes_offset;    compile_block.ctypes = tables + ctypes_offset;
905    
906    /* See if we can find a fixed set of initial characters for the pattern. */    /* See if we can find a fixed set of initial characters for the pattern. */
907    
908    memset(start_bits, 0, 32 * sizeof(uschar));    memset(start_bits, 0, 32 * sizeof(uschar));
909    bits_set = set_start_bits(code, start_bits,    bits_set = set_start_bits(code, start_bits,
910      (re->options & PCRE_CASELESS) != 0, (re->options & PCRE_UTF8) != 0,      (re->options & PCRE_CASELESS) != 0, (re->options & PCRE_UTF8) != 0,
911      &compile_block) == SSB_DONE;      &compile_block) == SSB_DONE;
912    }    }
913    
914  /* Find the minimum length of subject string. */  /* Find the minimum length of subject string. */
915    
916  min = find_minlength(code, code, re->options);  min = find_minlength(code, code, re->options);
# Line 947  if (bits_set) Line 947  if (bits_set)
947    study->flags |= PCRE_STUDY_MAPPED;    study->flags |= PCRE_STUDY_MAPPED;
948    memcpy(study->start_bits, start_bits, sizeof(start_bits));    memcpy(study->start_bits, start_bits, sizeof(start_bits));
949    }    }
950    
951  if (min >= 0)  if (min >= 0)
952    {    {
953    study->flags |= PCRE_STUDY_MINLEN;    study->flags |= PCRE_STUDY_MINLEN;
954    study->minlength = min;    study->minlength = min;
955    }    }
956    
957  return extra;  return extra;
958  }  }

Legend:
Removed from v.460  
changed lines
  Added in v.461

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12