/[pcre]/code/trunk/pcre_study.c
ViewVC logotype

Diff of /code/trunk/pcre_study.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 605 by ph10, Fri Jun 3 18:18:30 2011 UTC revision 673 by ph10, Thu Aug 25 16:06:03 2011 UTC
# Line 66  string of that length that matches. In U Line 66  string of that length that matches. In U
66  rather than bytes.  rather than bytes.
67    
68  Arguments:  Arguments:
69    code       pointer to start of group (the bracket)    code        pointer to start of group (the bracket)
70    startcode  pointer to start of the whole pattern    startcode   pointer to start of the whole pattern
71    options    the compiling options    options     the compiling options
72      had_accept  pointer to flag for (*ACCEPT) encountered
73      int         RECURSE depth
74    
75  Returns:   the minimum length  Returns:   the minimum length
76             -1 if \C was encountered             -1 if \C was encountered
# Line 77  Returns: the minimum length Line 79  Returns: the minimum length
79  */  */
80    
81  static int  static int
82  find_minlength(const uschar *code, const uschar *startcode, int options)  find_minlength(const uschar *code, const uschar *startcode, int options,
83      BOOL *had_accept_ptr, int recurse_depth)
84  {  {
85  int length = -1;  int length = -1;
86  BOOL utf8 = (options & PCRE_UTF8) != 0;  BOOL utf8 = (options & PCRE_UTF8) != 0;
# Line 125  for (;;) Line 128  for (;;)
128      case OP_BRAPOS:      case OP_BRAPOS:
129      case OP_SBRAPOS:      case OP_SBRAPOS:
130      case OP_ONCE:      case OP_ONCE:
131      d = find_minlength(cc, startcode, options);      d = find_minlength(cc, startcode, options, had_accept_ptr, recurse_depth);
132      if (d < 0) return d;      if (d < 0) return d;
133      branchlength += d;      branchlength += d;
134        if (*had_accept_ptr) return branchlength;
135      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
136      cc += 1 + LINK_SIZE;      cc += 1 + LINK_SIZE;
137      break;      break;
138    
139      /* Reached end of a branch; if it's a ket it is the end of a nested      /* Reached end of a branch; if it's a ket it is the end of a nested
140      call. If it's ALT it is an alternation in a nested call. If it is      call. If it's ALT it is an alternation in a nested call. If it is END it's
141      END it's the end of the outer call. All can be handled by the same code. */      the end of the outer call. All can be handled by the same code. If it is
142        ACCEPT, it is essentially the same as END, but we set a flag so that
143        counting stops. */
144    
145        case OP_ACCEPT:
146        case OP_ASSERT_ACCEPT:
147        *had_accept_ptr = TRUE;
148        /* Fall through */
149      case OP_ALT:      case OP_ALT:
150      case OP_KET:      case OP_KET:
151      case OP_KETRMAX:      case OP_KETRMAX:
# Line 144  for (;;) Line 154  for (;;)
154      case OP_END:      case OP_END:
155      if (length < 0 || (!had_recurse && branchlength < length))      if (length < 0 || (!had_recurse && branchlength < length))
156        length = branchlength;        length = branchlength;
157      if (*cc != OP_ALT) return length;      if (op != OP_ALT) return length;
158      cc += 1 + LINK_SIZE;      cc += 1 + LINK_SIZE;
159      branchlength = 0;      branchlength = 0;
160      had_recurse = FALSE;      had_recurse = FALSE;
# Line 267  for (;;) Line 277  for (;;)
277      cc++;      cc++;
278      break;      break;
279    
280      /* "Any newline" might match two characters */      /* "Any newline" might match two characters, but it also might match just
281        one. */
282    
283      case OP_ANYNL:      case OP_ANYNL:
284      branchlength += 2;      branchlength += 1;
285      cc++;      cc++;
286      break;      break;
287    
# Line 366  for (;;) Line 377  for (;;)
377          d = 0;          d = 0;
378          had_recurse = TRUE;          had_recurse = TRUE;
379          }          }
380        else d = find_minlength(cs, startcode, options);        else
381            {
382            d = find_minlength(cs, startcode, options, had_accept_ptr,
383              recurse_depth);
384            *had_accept_ptr = FALSE;
385            }
386        }        }
387      else d = 0;      else d = 0;
388      cc += 3;      cc += 3;
# Line 403  for (;;) Line 419  for (;;)
419      branchlength += min * d;      branchlength += min * d;
420      break;      break;
421    
422        /* We can easily detect direct recursion, but not mutual recursion. This is
423        caught by a recursion depth count. */
424    
425      case OP_RECURSE:      case OP_RECURSE:
426      cs = ce = (uschar *)startcode + GET(cc, 1);      cs = ce = (uschar *)startcode + GET(cc, 1);
427      if (cs == NULL) return -2;      if (cs == NULL) return -2;
428      do ce += GET(ce, 1); while (*ce == OP_ALT);      do ce += GET(ce, 1); while (*ce == OP_ALT);
429      if (cc > cs && cc < ce)      if ((cc > cs && cc < ce) || recurse_depth > 10)
430        had_recurse = TRUE;        had_recurse = TRUE;
431      else      else
432        branchlength += find_minlength(cs, startcode, options);        {
433          branchlength += find_minlength(cs, startcode, options, had_accept_ptr,
434            recurse_depth + 1);
435          *had_accept_ptr = FALSE;
436          }
437      cc += 1 + LINK_SIZE;      cc += 1 + LINK_SIZE;
438      break;      break;
439    
# Line 481  for (;;) Line 504  for (;;)
504    
505      /* The remaining opcodes are just skipped over. */      /* The remaining opcodes are just skipped over. */
506    
     case OP_ACCEPT:  
507      case OP_CLOSE:      case OP_CLOSE:
508      case OP_COMMIT:      case OP_COMMIT:
509      case OP_FAIL:      case OP_FAIL:
# Line 687  do Line 709  do
709    while (try_next)    /* Loop for items in this branch */    while (try_next)    /* Loop for items in this branch */
710      {      {
711      int rc;      int rc;
712    
713      switch(*tcode)      switch(*tcode)
714        {        {
715        /* If we reach something we don't understand, it means a new opcode has        /* If we reach something we don't understand, it means a new opcode has
# Line 699  do Line 722  do
722        /* Fail for a valid opcode that implies no starting bits. */        /* Fail for a valid opcode that implies no starting bits. */
723    
724        case OP_ACCEPT:        case OP_ACCEPT:
725          case OP_ASSERT_ACCEPT:
726        case OP_ALLANY:        case OP_ALLANY:
727        case OP_ANY:        case OP_ANY:
728        case OP_ANYBYTE:        case OP_ANYBYTE:
729        case OP_CIRC:        case OP_CIRC:
730        case OP_CIRCM:        case OP_CIRCM:
731        case OP_CLOSE:        case OP_CLOSE:
732        case OP_COMMIT:        case OP_COMMIT:
733        case OP_COND:        case OP_COND:
734        case OP_CREF:        case OP_CREF:
735        case OP_DEF:        case OP_DEF:
736        case OP_DOLL:        case OP_DOLL:
737        case OP_DOLLM:        case OP_DOLLM:
738        case OP_END:        case OP_END:
739        case OP_EOD:        case OP_EOD:
740        case OP_EODN:        case OP_EODN:
741        case OP_EXTUNI:        case OP_EXTUNI:
742        case OP_FAIL:        case OP_FAIL:
743        case OP_MARK:        case OP_MARK:
# Line 721  do Line 745  do
745        case OP_NOT:        case OP_NOT:
746        case OP_NOTEXACT:        case OP_NOTEXACT:
747        case OP_NOTEXACTI:        case OP_NOTEXACTI:
748        case OP_NOTI:        case OP_NOTI:
749        case OP_NOTMINPLUS:        case OP_NOTMINPLUS:
750        case OP_NOTMINPLUSI:        case OP_NOTMINPLUSI:
751        case OP_NOTMINQUERY:        case OP_NOTMINQUERY:
# Line 749  do Line 773  do
773        case OP_NOTUPTOI:        case OP_NOTUPTOI:
774        case OP_NOT_HSPACE:        case OP_NOT_HSPACE:
775        case OP_NOT_VSPACE:        case OP_NOT_VSPACE:
       case OP_NOT_WORD_BOUNDARY:  
776        case OP_NRREF:        case OP_NRREF:
777        case OP_PROP:        case OP_PROP:
778        case OP_PRUNE:        case OP_PRUNE:
# Line 759  do Line 782  do
782        case OP_REFI:        case OP_REFI:
783        case OP_REVERSE:        case OP_REVERSE:
784        case OP_RREF:        case OP_RREF:
785        case OP_SCOND:        case OP_SCOND:
786        case OP_SET_SOM:        case OP_SET_SOM:
787        case OP_SKIP:        case OP_SKIP:
788        case OP_SKIP_ARG:        case OP_SKIP_ARG:
# Line 767  do Line 790  do
790        case OP_SOM:        case OP_SOM:
791        case OP_THEN:        case OP_THEN:
792        case OP_THEN_ARG:        case OP_THEN_ARG:
       case OP_WORD_BOUNDARY:  
793        case OP_XCLASS:        case OP_XCLASS:
794        return SSB_FAIL;        return SSB_FAIL;
795    
796          /* We can ignore word boundary tests. */
797    
798          case OP_WORD_BOUNDARY:
799          case OP_NOT_WORD_BOUNDARY:
800          tcode++;
801          break;
802    
803        /* If we hit a bracket or a positive lookahead assertion, recurse to set        /* If we hit a bracket or a positive lookahead assertion, recurse to set
804        bits from within the subpattern. If it can't find anything, we have to        bits from within the subpattern. If it can't find anything, we have to
805        give up. If it finds some mandatory character(s), we are done for this        give up. If it finds some mandatory character(s), we are done for this
# Line 1136  do Line 1165  do
1165            for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];            for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
1166            }            }
1167    
1168          /* Advance past the bit map, and act on what follows. For a zero          /* Advance past the bit map, and act on what follows. For a zero
1169          minimum repeat, continue; otherwise stop processing. */          minimum repeat, continue; otherwise stop processing. */
1170    
1171          tcode += 32;          tcode += 32;
# Line 1154  do Line 1183  do
1183            if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5;            if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5;
1184              else try_next = FALSE;              else try_next = FALSE;
1185            break;            break;
1186    
1187            default:            default:
1188            try_next = FALSE;            try_next = FALSE;
1189            break;            break;
# Line 1199  pcre_study(const pcre *external_re, int Line 1228  pcre_study(const pcre *external_re, int
1228  {  {
1229  int min;  int min;
1230  BOOL bits_set = FALSE;  BOOL bits_set = FALSE;
1231    BOOL had_accept = FALSE;
1232  uschar start_bits[32];  uschar start_bits[32];
1233  pcre_extra *extra;  pcre_extra *extra;
1234  pcre_study_data *study;  pcre_study_data *study;
# Line 1256  if ((re->options & PCRE_ANCHORED) == 0 & Line 1286  if ((re->options & PCRE_ANCHORED) == 0 &
1286    
1287  /* Find the minimum length of subject string. */  /* Find the minimum length of subject string. */
1288    
1289  switch(min = find_minlength(code, code, re->options))  switch(min = find_minlength(code, code, re->options, &had_accept, 0))
1290    {    {
1291    case -2: *errorptr = "internal error: missing capturing bracket"; break;    case -2: *errorptr = "internal error: missing capturing bracket"; break;
1292    case -3: *errorptr = "internal error: opcode not recognized"; break;    case -3: *errorptr = "internal error: opcode not recognized"; break;
1293    default: break;    default: break;
1294    }    }
1295    
1296  /* Return NULL if there's been an error or if no optimization is possible. */  /* Return NULL if there's been an (internal) error or if no optimization is
1297    possible. A FALSE setting for bits_set is common when there are no obvious
1298    starting bytes. However a negative value of min occurs only when the pattern
1299    contains \C, in other words, it's an exceptional case nowadays. */
1300    
1301  if (*errorptr != NULL || (!bits_set && min < 0)) return NULL;  if (*errorptr != NULL || (!bits_set && min < 0)) return NULL;
1302    
# Line 1302  if (min >= 0) Line 1335  if (min >= 0)
1335    study->minlength = min;    study->minlength = min;
1336    }    }
1337    
1338    /* If JIT support was compiled and requested, attempt the JIT compilation. */
1339    
1340    extra->executable_jit = NULL;
1341    #ifdef SUPPORT_JIT
1342    if ((options & PCRE_STUDY_JIT_COMPILE) != 0) _pcre_jit_compile(re, extra);
1343    #endif
1344    
1345  return extra;  return extra;
1346  }  }
1347    
1348    
1349    /*************************************************
1350    *          Free the study data                   *
1351    *************************************************/
1352    
1353    /* This function frees the memory that was obtained by pcre_study().
1354    
1355    Argument:   a pointer to the pcre_extra block
1356    Returns:    nothing
1357    */
1358    
1359    PCRE_EXP_DEFN void
1360    pcre_free_study(pcre_extra *extra)
1361    {
1362    #ifdef SUPPORT_JIT
1363    if ((extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0 &&
1364         extra->executable_jit != NULL)
1365      _pcre_jit_free(extra->executable_jit);
1366    #endif
1367    pcre_free(extra);
1368    }
1369    
1370  /* End of pcre_study.c */  /* End of pcre_study.c */

Legend:
Removed from v.605  
changed lines
  Added in v.673

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12