/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Diff of /code/trunk/pcre_dfa_exec.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 383 by ph10, Sun Mar 8 15:26:59 2009 UTC revision 435 by ph10, Sat Sep 5 10:20:44 2009 UTC
# Line 3  Line 3 
3  *************************************************/  *************************************************/
4    
5  /* PCRE is a library of functions to support regular expressions whose syntax  /* PCRE is a library of functions to support regular expressions whose syntax
6  and semantics are as close as possible to those of the Perl 5 language (but see  and semantics are as close as possible to those of the Perl 5 language (but see
7  below for why this module is different).  below for why this module is different).
8    
9                         Written by Philip Hazel                         Written by Philip Hazel
# Line 61  applications. */ Line 61  applications. */
61  #define SP "                   "  #define SP "                   "
62    
63    
   
64  /*************************************************  /*************************************************
65  *      Code parameters and static tables         *  *      Code parameters and static tables         *
66  *************************************************/  *************************************************/
# Line 390  if (*first_op == OP_REVERSE) Line 389  if (*first_op == OP_REVERSE)
389        current_subject - start_subject : max_back;        current_subject - start_subject : max_back;
390      current_subject -= gone_back;      current_subject -= gone_back;
391      }      }
392    
393      /* Save the earliest consulted character */
394    
395      if (current_subject < md->start_used_ptr)
396        md->start_used_ptr = current_subject;
397    
398    /* Now we can process the individual branches. */    /* Now we can process the individual branches. */
399    
# Line 455  for (;;) Line 459  for (;;)
459    int i, j;    int i, j;
460    int clen, dlen;    int clen, dlen;
461    unsigned int c, d;    unsigned int c, d;
462      int forced_fail = 0;
463      int reached_end = 0;
464    
465    /* Make the new state list into the active state list and empty the    /* Make the new state list into the active state list and empty the
466    new state list. */    new state list. */
# Line 512  for (;;) Line 518  for (;;)
518      stateblock *current_state = active_states + i;      stateblock *current_state = active_states + i;
519      const uschar *code;      const uschar *code;
520      int state_offset = current_state->offset;      int state_offset = current_state->offset;
521      int count, codevalue;      int count, codevalue, rrc;
522    
523  #ifdef DEBUG  #ifdef DEBUG
524      printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);      printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
# Line 625  for (;;) Line 631  for (;;)
631            ADD_ACTIVE(state_offset - GET(code, 1), 0);            ADD_ACTIVE(state_offset - GET(code, 1), 0);
632            }            }
633          }          }
634        else if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0)        else
635          {          {
636          if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;          reached_end++;    /* Count branches that reach the end */
637            else if (match_count > 0 && ++match_count * 2 >= offsetcount)          if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0)
638              match_count = 0;            {
639          count = ((match_count == 0)? offsetcount : match_count * 2) - 2;            if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
640          if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));              else if (match_count > 0 && ++match_count * 2 >= offsetcount)
641          if (offsetcount >= 2)                match_count = 0;
642            {            count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
643            offsets[0] = current_subject - start_subject;            if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
644            offsets[1] = ptr - start_subject;            if (offsetcount >= 2)
645            DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,              {
646              offsets[1] - offsets[0], current_subject));              offsets[0] = current_subject - start_subject;
647            }              offsets[1] = ptr - start_subject;
648          if ((md->moptions & PCRE_DFA_SHORTEST) != 0)              DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
649            {                offsets[1] - offsets[0], current_subject));
650            DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"              }
651              "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,            if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
652              match_count, rlevel*2-2, SP));              {
653            return match_count;              DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
654            }                "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
655                  match_count, rlevel*2-2, SP));
656                return match_count;
657                }
658              }
659          }          }
660        break;        break;
661    
# Line 795  for (;;) Line 805  for (;;)
805          if (ptr > start_subject)          if (ptr > start_subject)
806            {            {
807            const uschar *temp = ptr - 1;            const uschar *temp = ptr - 1;
808              if (temp < md->start_used_ptr) md->start_used_ptr = temp;
809  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
810            if (utf8) BACKCHAR(temp);            if (utf8) BACKCHAR(temp);
811  #endif  #endif
# Line 803  for (;;) Line 814  for (;;)
814            }            }
815          else left_word = 0;          else left_word = 0;
816    
817          if (clen > 0) right_word = c < 256 && (ctypes[c] & ctype_word) != 0;          if (clen > 0)
818            else right_word = 0;            right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
819            else              /* This is a fudge to ensure that if this is the */
820              {               /* last item in the pattern, we don't count it as */
821              reached_end--;  /* reached, thus disabling a partial match. */
822              right_word = 0;
823              }
824    
825          if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))          if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
826            { ADD_ACTIVE(state_offset + 1, 0); }            { ADD_ACTIVE(state_offset + 1, 0); }
# Line 2158  for (;;) Line 2174  for (;;)
2174    
2175  /* ========================================================================== */  /* ========================================================================== */
2176        /* These are the opcodes for fancy brackets of various kinds. We have        /* These are the opcodes for fancy brackets of various kinds. We have
2177        to use recursion in order to handle them. The "always failing" assersion        to use recursion in order to handle them. The "always failing" assertion
2178        (?!) is optimised when compiling to OP_FAIL, so we have to support that,        (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2179        though the other "backtracking verbs" are not supported. */        though the other "backtracking verbs" are not supported. */
2180    
2181        case OP_FAIL:        case OP_FAIL:
2182          forced_fail++;    /* Count FAILs for multiple states */
2183        break;        break;
2184    
2185        case OP_ASSERT:        case OP_ASSERT:
# Line 2201  for (;;) Line 2218  for (;;)
2218          {          {
2219          int local_offsets[1000];          int local_offsets[1000];
2220          int local_workspace[1000];          int local_workspace[1000];
2221          int condcode = code[LINK_SIZE+1];          int codelink = GET(code, 1);
2222            int condcode;
2223    
2224            /* Because of the way auto-callout works during compile, a callout item
2225            is inserted between OP_COND and an assertion condition. This does not
2226            happen for the other conditions. */
2227    
2228            if (code[LINK_SIZE+1] == OP_CALLOUT)
2229              {
2230              rrc = 0;
2231              if (pcre_callout != NULL)
2232                {
2233                pcre_callout_block cb;
2234                cb.version          = 1;   /* Version 1 of the callout block */
2235                cb.callout_number   = code[LINK_SIZE+2];
2236                cb.offset_vector    = offsets;
2237                cb.subject          = (PCRE_SPTR)start_subject;
2238                cb.subject_length   = end_subject - start_subject;
2239                cb.start_match      = current_subject - start_subject;
2240                cb.current_position = ptr - start_subject;
2241                cb.pattern_position = GET(code, LINK_SIZE + 3);
2242                cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
2243                cb.capture_top      = 1;
2244                cb.capture_last     = -1;
2245                cb.callout_data     = md->callout_data;
2246                if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc;   /* Abandon */
2247                }
2248              if (rrc > 0) break;                      /* Fail this thread */
2249              code += _pcre_OP_lengths[OP_CALLOUT];    /* Skip callout data */
2250              }
2251    
2252            condcode = code[LINK_SIZE+1];
2253    
2254          /* Back reference conditions are not supported */          /* Back reference conditions are not supported */
2255    
# Line 2210  for (;;) Line 2258  for (;;)
2258          /* The DEFINE condition is always false */          /* The DEFINE condition is always false */
2259    
2260          if (condcode == OP_DEF)          if (condcode == OP_DEF)
2261            {            { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
           ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0);  
           }  
2262    
2263          /* The only supported version of OP_RREF is for the value RREF_ANY,          /* The only supported version of OP_RREF is for the value RREF_ANY,
2264          which means "test if in any recursion". We can't test for specifically          which means "test if in any recursion". We can't test for specifically
# Line 2222  for (;;) Line 2268  for (;;)
2268            {            {
2269            int value = GET2(code, LINK_SIZE+2);            int value = GET2(code, LINK_SIZE+2);
2270            if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;            if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2271            if (recursing > 0) { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }            if (recursing > 0)
2272              else { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }              { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
2273              else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2274            }            }
2275    
2276          /* Otherwise, the condition is an assertion */          /* Otherwise, the condition is an assertion */
# Line 2253  for (;;) Line 2300  for (;;)
2300                  (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))                  (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2301              { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }              { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2302            else            else
2303              { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }              { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2304            }            }
2305          }          }
2306        break;        break;
# Line 2405  for (;;) Line 2452  for (;;)
2452        /* Handle callouts */        /* Handle callouts */
2453    
2454        case OP_CALLOUT:        case OP_CALLOUT:
2455          rrc = 0;
2456        if (pcre_callout != NULL)        if (pcre_callout != NULL)
2457          {          {
         int rrc;  
2458          pcre_callout_block cb;          pcre_callout_block cb;
2459          cb.version          = 1;   /* Version 1 of the callout block */          cb.version          = 1;   /* Version 1 of the callout block */
2460          cb.callout_number   = code[1];          cb.callout_number   = code[1];
# Line 2422  for (;;) Line 2469  for (;;)
2469          cb.capture_last     = -1;          cb.capture_last     = -1;
2470          cb.callout_data     = md->callout_data;          cb.callout_data     = md->callout_data;
2471          if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc;   /* Abandon */          if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc;   /* Abandon */
         if (rrc == 0) { ADD_ACTIVE(state_offset + 2 + 2*LINK_SIZE, 0); }  
2472          }          }
2473          if (rrc == 0)
2474            { ADD_ACTIVE(state_offset + _pcre_OP_lengths[OP_CALLOUT], 0); }
2475        break;        break;
2476    
2477    
# Line 2439  for (;;) Line 2487  for (;;)
2487    /* We have finished the processing at the current subject character. If no    /* We have finished the processing at the current subject character. If no
2488    new states have been set for the next character, we have found all the    new states have been set for the next character, we have found all the
2489    matches that we are going to find. If we are at the top level and partial    matches that we are going to find. If we are at the top level and partial
2490    matching has been requested, check for appropriate conditions. */    matching has been requested, check for appropriate conditions. The "forced_
2491      fail" variable counts the number of (*F) encountered for the character. If it
2492      is equal to the original active_count (saved in workspace[1]) it means that
2493      (*F) was found on every active state. In this case we don't want to give a
2494      partial match. */
2495    
2496    if (new_count <= 0)    if (new_count <= 0)
2497      {      {
2498      if (match_count < 0 &&                     /* No matches found */      if (rlevel == 1 &&                               /* Top level, and */
2499          rlevel == 1 &&                         /* Top level match function */          reached_end != workspace[1] &&               /* Not all reached end */
2500          (md->moptions & PCRE_PARTIAL) != 0 &&  /* Want partial matching */          forced_fail != workspace[1] &&               /* Not all forced fail & */
2501          ptr >= end_subject &&                  /* Reached end of subject */          (                                            /* either... */
2502          ptr > current_subject)                 /* Matched non-empty string */          (md->moptions & PCRE_PARTIAL_HARD) != 0      /* Hard partial */
2503            ||                                           /* or... */
2504            ((md->moptions & PCRE_PARTIAL_SOFT) != 0 &&  /* Soft partial and */
2505             match_count < 0)                            /* no matches */
2506            ) &&                                         /* And... */
2507            ptr >= end_subject &&                     /* Reached end of subject */
2508            ptr > current_subject)                    /* Matched non-empty string */
2509        {        {
2510        if (offsetcount >= 2)        if (offsetcount >= 2)
2511          {          {
2512          offsets[0] = current_subject - start_subject;          offsets[0] = md->start_used_ptr - start_subject;
2513          offsets[1] = end_subject - start_subject;          offsets[1] = end_subject - start_subject;
2514          }          }
2515        match_count = PCRE_ERROR_PARTIAL;        match_count = PCRE_ERROR_PARTIAL;
# Line 2615  switch ((((options & PCRE_NEWLINE_BITS) Line 2673  switch ((((options & PCRE_NEWLINE_BITS)
2673           PCRE_NEWLINE_BITS)           PCRE_NEWLINE_BITS)
2674    {    {
2675    case 0: newline = NEWLINE; break;   /* Compile-time default */    case 0: newline = NEWLINE; break;   /* Compile-time default */
2676    case PCRE_NEWLINE_CR: newline = '\r'; break;    case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
2677    case PCRE_NEWLINE_LF: newline = '\n'; break;    case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
2678    case PCRE_NEWLINE_CR+    case PCRE_NEWLINE_CR+
2679         PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;         PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
2680    case PCRE_NEWLINE_ANY: newline = -1; break;    case PCRE_NEWLINE_ANY: newline = -1; break;
2681    case PCRE_NEWLINE_ANYCRLF: newline = -2; break;    case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
2682    default: return PCRE_ERROR_BADNEWLINE;    default: return PCRE_ERROR_BADNEWLINE;
# Line 2714  if ((re->flags & PCRE_REQCHSET) != 0) Line 2772  if ((re->flags & PCRE_REQCHSET) != 0)
2772    }    }
2773    
2774  /* Call the main matching function, looping for a non-anchored regex after a  /* Call the main matching function, looping for a non-anchored regex after a
2775  failed match. Unless restarting, optimize by moving to the first match  failed match. If not restarting, perform certain optimizations at the start of
2776  character if possible, when not anchored. Then unless wanting a partial match,  a match. */
 check for a required later character. */  
2777    
2778  for (;;)  for (;;)
2779    {    {
# Line 2726  for (;;) Line 2783  for (;;)
2783      {      {
2784      const uschar *save_end_subject = end_subject;      const uschar *save_end_subject = end_subject;
2785    
2786      /* Advance to a unique first char if possible. If firstline is TRUE, the      /* If firstline is TRUE, the start of the match is constrained to the first
2787      start of the match is constrained to the first line of a multiline string.      line of a multiline string. Implement this by temporarily adjusting
2788      Implement this by temporarily adjusting end_subject so that we stop      end_subject so that we stop scanning at a newline. If the match fails at
2789      scanning at a newline. If the match fails at the newline, later code breaks      the newline, later code breaks this loop. */
     this loop. */  
2790    
2791      if (firstline)      if (firstline)
2792        {        {
# Line 2750  for (;;) Line 2806  for (;;)
2806        end_subject = t;        end_subject = t;
2807        }        }
2808    
2809      if (first_byte >= 0)      /* There are some optimizations that avoid running the match if a known
2810        starting point is not found, or if a known later character is not present.
2811        However, there is an option that disables these, for testing and for
2812        ensuring that all callouts do actually occur. */
2813    
2814        if ((options & PCRE_NO_START_OPTIMIZE) == 0)
2815        {        {
       if (first_byte_caseless)  
         while (current_subject < end_subject &&  
                lcc[*current_subject] != first_byte)  
           current_subject++;  
       else  
         while (current_subject < end_subject && *current_subject != first_byte)  
           current_subject++;  
       }  
2816    
2817      /* Or to just after a linebreak for a multiline match if possible */        /* Advance to a known first byte. */
2818    
2819      else if (startline)        if (first_byte >= 0)
       {  
       if (current_subject > md->start_subject + start_offset)  
2820          {          {
2821  #ifdef SUPPORT_UTF8          if (first_byte_caseless)
2822          if (utf8)            while (current_subject < end_subject &&
2823                     lcc[*current_subject] != first_byte)
2824                current_subject++;
2825            else
2826              while (current_subject < end_subject &&
2827                     *current_subject != first_byte)
2828                current_subject++;
2829            }
2830    
2831          /* Or to just after a linebreak for a multiline match if possible */
2832    
2833          else if (startline)
2834            {
2835            if (current_subject > md->start_subject + start_offset)
2836            {            {
2837            while (current_subject < end_subject && !WAS_NEWLINE(current_subject))  #ifdef SUPPORT_UTF8
2838              if (utf8)
2839              {              {
2840              current_subject++;              while (current_subject < end_subject &&
2841              while(current_subject < end_subject &&                     !WAS_NEWLINE(current_subject))
2842                    (*current_subject & 0xc0) == 0x80)                {
2843                current_subject++;                current_subject++;
2844                  while(current_subject < end_subject &&
2845                        (*current_subject & 0xc0) == 0x80)
2846                    current_subject++;
2847                  }
2848              }              }
2849            }            else
         else  
2850  #endif  #endif
2851          while (current_subject < end_subject && !WAS_NEWLINE(current_subject))            while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
2852            current_subject++;              current_subject++;
2853    
2854          /* If we have just passed a CR and the newline option is ANY or            /* If we have just passed a CR and the newline option is ANY or
2855          ANYCRLF, and we are now at a LF, advance the match position by one more            ANYCRLF, and we are now at a LF, advance the match position by one
2856          character. */            more character. */
2857    
2858          if (current_subject[-1] == '\r' &&            if (current_subject[-1] == CHAR_CR &&
2859               (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&                 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
2860               current_subject < end_subject &&                 current_subject < end_subject &&
2861               *current_subject == '\n')                 *current_subject == CHAR_NL)
2862            current_subject++;              current_subject++;
2863              }
2864          }          }
       }  
2865    
2866      /* Or to a non-unique first char after study */        /* Or to a non-unique first char after study */
2867    
2868      else if (start_bits != NULL)        else if (start_bits != NULL)
       {  
       while (current_subject < end_subject)  
2869          {          {
2870          register unsigned int c = *current_subject;          while (current_subject < end_subject)
2871          if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;            {
2872            else break;            register unsigned int c = *current_subject;
2873              if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
2874                else break;
2875              }
2876          }          }
2877        }        }
2878    
# Line 2825  for (;;) Line 2894  for (;;)
2894    showed up when somebody was matching /^C/ on a 32-megabyte string... so we    showed up when somebody was matching /^C/ on a 32-megabyte string... so we
2895    don't do this when the string is sufficiently long.    don't do this when the string is sufficiently long.
2896    
2897    ALSO: this processing is disabled when partial matching is requested.    ALSO: this processing is disabled when partial matching is requested, and can
2898    */    also be explicitly deactivated. Furthermore, we have to disable when
2899      restarting after a partial match, because the required character may have
2900      already been matched. */
2901    
2902    if (req_byte >= 0 &&    if ((options & PCRE_NO_START_OPTIMIZE) == 0 &&
2903          req_byte >= 0 &&
2904        end_subject - current_subject < REQ_BYTE_MAX &&        end_subject - current_subject < REQ_BYTE_MAX &&
2905        (options & PCRE_PARTIAL) == 0)        (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT|PCRE_DFA_RESTART)) == 0)
2906      {      {
2907      register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);      register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
2908    
# Line 2870  for (;;) Line 2942  for (;;)
2942    
2943    /* OK, now we can do the business */    /* OK, now we can do the business */
2944    
2945      md->start_used_ptr = current_subject;
2946    
2947    rc = internal_dfa_exec(    rc = internal_dfa_exec(
2948      md,                                /* fixed match data */      md,                                /* fixed match data */
2949      md->start_code,                    /* this subexpression's code */      md->start_code,                    /* this subexpression's code */
# Line 2904  for (;;) Line 2978  for (;;)
2978    not contain any explicit matches for \r or \n, and the newline option is CRLF    not contain any explicit matches for \r or \n, and the newline option is CRLF
2979    or ANY or ANYCRLF, advance the match position by one more character. */    or ANY or ANYCRLF, advance the match position by one more character. */
2980    
2981    if (current_subject[-1] == '\r' &&    if (current_subject[-1] == CHAR_CR &&
2982        current_subject < end_subject &&        current_subject < end_subject &&
2983        *current_subject == '\n' &&        *current_subject == CHAR_NL &&
2984        (re->flags & PCRE_HASCRORLF) == 0 &&        (re->flags & PCRE_HASCRORLF) == 0 &&
2985          (md->nltype == NLTYPE_ANY ||          (md->nltype == NLTYPE_ANY ||
2986           md->nltype == NLTYPE_ANYCRLF ||           md->nltype == NLTYPE_ANYCRLF ||

Legend:
Removed from v.383  
changed lines
  Added in v.435

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12