/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Diff of /code/trunk/pcre_exec.c

Parent Directory | Revision Log | View Patch

revision 359 by ph10, Wed Jul 9 16:20:19 2008 UTC | revision 402 by ph10, Sat Mar 21 17:26:03 2009 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2008 University of Cambridge             Copyright (c) 1997-2009 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 334  typedef struct heapframe { Line 334  typedef struct heapframe {
334    /* Function local variables */    /* Function local variables */
335    
336    const uschar *Xcallpat;    const uschar *Xcallpat;
337    #ifdef SUPPORT_UTF8
338    const uschar *Xcharptr;    const uschar *Xcharptr;
339    #endif
340    const uschar *Xdata;    const uschar *Xdata;
341    const uschar *Xnext;    const uschar *Xnext;
342    const uschar *Xpp;    const uschar *Xpp;
# Line 561  int oclength; Line 563  int oclength;
563  uschar occhars[8];  uschar occhars[8];
564  #endif  #endif
565    
566    int codelink;
567    int condcode;
568  int ctype;  int ctype;
569  int length;  int length;
570  int max;  int max;
# Line 635  for (;;) Line 639  for (;;)
639    {    {
640    minimize = possessive = FALSE;    minimize = possessive = FALSE;
641    op = *ecode;    op = *ecode;
642    
643    /* For partial matching, remember if we ever hit the end of the subject after    /* For partial matching, remember if we ever hit the end of the subject after
644    matching at least one subject character. */    matching at least one subject character. */
645    
# Line 787  for (;;) Line 791  for (;;)
791    
792      case OP_COND:      case OP_COND:
793      case OP_SCOND:      case OP_SCOND:
794      if (ecode[LINK_SIZE+1] == OP_RREF)         /* Recursion test */      codelink= GET(ecode, 1);
795    
796        /* Because of the way auto-callout works during compile, a callout item is
797        inserted between OP_COND and an assertion condition. */
798    
799        if (ecode[LINK_SIZE+1] == OP_CALLOUT)
800          {
801          if (pcre_callout != NULL)
802            {
803            pcre_callout_block cb;
804            cb.version          = 1;   /* Version 1 of the callout block */
805            cb.callout_number   = ecode[LINK_SIZE+2];
806            cb.offset_vector    = md->offset_vector;
807            cb.subject          = (PCRE_SPTR)md->start_subject;
808            cb.subject_length   = md->end_subject - md->start_subject;
809            cb.start_match      = mstart - md->start_subject;
810            cb.current_position = eptr - md->start_subject;
811            cb.pattern_position = GET(ecode, LINK_SIZE + 3);
812            cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
813            cb.capture_top      = offset_top/2;
814            cb.capture_last     = md->capture_last;
815            cb.callout_data     = md->callout_data;
816            if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
817            if (rrc < 0) RRETURN(rrc);
818            }
819          ecode += _pcre_OP_lengths[OP_CALLOUT];
820          }
821    
822        condcode = ecode[LINK_SIZE+1];
823    
824        /* Now see what the actual condition is */
825    
826        if (condcode == OP_RREF)         /* Recursion test */
827        {        {
828        offset = GET2(ecode, LINK_SIZE + 2);     /* Recursion group number*/        offset = GET2(ecode, LINK_SIZE + 2);     /* Recursion group number*/
829        condition = md->recursive != NULL &&        condition = md->recursive != NULL &&
# Line 795  for (;;) Line 831  for (;;)
831        ecode += condition? 3 : GET(ecode, 1);        ecode += condition? 3 : GET(ecode, 1);
832        }        }
833    
834      else if (ecode[LINK_SIZE+1] == OP_CREF)    /* Group used test */      else if (condcode == OP_CREF)    /* Group used test */
835        {        {
836        offset = GET2(ecode, LINK_SIZE+2) << 1;  /* Doubled ref number */        offset = GET2(ecode, LINK_SIZE+2) << 1;  /* Doubled ref number */
837        condition = offset < offset_top && md->offset_vector[offset] >= 0;        condition = offset < offset_top && md->offset_vector[offset] >= 0;
838        ecode += condition? 3 : GET(ecode, 1);        ecode += condition? 3 : GET(ecode, 1);
839        }        }
840    
841      else if (ecode[LINK_SIZE+1] == OP_DEF)     /* DEFINE - always false */      else if (condcode == OP_DEF)     /* DEFINE - always false */
842        {        {
843        condition = FALSE;        condition = FALSE;
844        ecode += GET(ecode, 1);        ecode += GET(ecode, 1);
# Line 829  for (;;) Line 865  for (;;)
865        else        else
866          {          {
867          condition = FALSE;          condition = FALSE;
868          ecode += GET(ecode, 1);          ecode += codelink;
869          }          }
870        }        }
871    
# Line 852  for (;;) Line 888  for (;;)
888          goto TAIL_RECURSE;          goto TAIL_RECURSE;
889          }          }
890        }        }
891      else                         /* Condition false & no 2nd alternative */      else                         /* Condition false & no alternative */
892        {        {
893        ecode += 1 + LINK_SIZE;        ecode += 1 + LINK_SIZE;
894        }        }
# Line 1075  for (;;) Line 1111  for (;;)
1111          else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)          else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1112            {            {
1113            DPRINTF(("Recursion gave error %d\n", rrc));            DPRINTF(("Recursion gave error %d\n", rrc));
1114              if (new_recursive.offset_save != stacksave)
1115                (pcre_free)(new_recursive.offset_save);
1116            RRETURN(rrc);            RRETURN(rrc);
1117            }            }
1118    
# Line 1679  for (;;) Line 1717  for (;;)
1717      if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);      if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1718      GETCHARINCTEST(c, eptr);      GETCHARINCTEST(c, eptr);
1719        {        {
1720        const ucd_record * prop = GET_UCD(c);        const ucd_record *prop = GET_UCD(c);
1721    
1722        switch(ecode[1])        switch(ecode[1])
1723          {          {
# Line 2047  for (;;) Line 2085  for (;;)
2085    
2086    
2087      /* Match an extended character class. This opcode is encountered only      /* Match an extended character class. This opcode is encountered only
2088      in UTF-8 mode, because that's the only time it is compiled. */      when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2089        mode, because Unicode properties are supported in non-UTF-8 mode. */
2090    
2091  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2092      case OP_XCLASS:      case OP_XCLASS:
# Line 2089  for (;;) Line 2128  for (;;)
2128        for (i = 1; i <= min; i++)        for (i = 1; i <= min; i++)
2129          {          {
2130          if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);          if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2131          GETCHARINC(c, eptr);          GETCHARINCTEST(c, eptr);
2132          if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);          if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2133          }          }
2134    
# Line 2108  for (;;) Line 2147  for (;;)
2147            RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);            RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
2148            if (rrc != MATCH_NOMATCH) RRETURN(rrc);            if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2149            if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);            if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2150            GETCHARINC(c, eptr);            GETCHARINCTEST(c, eptr);
2151            if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);            if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2152            }            }
2153          /* Control never gets here */          /* Control never gets here */
# Line 2123  for (;;) Line 2162  for (;;)
2162            {            {
2163            int len = 1;            int len = 1;
2164            if (eptr >= md->end_subject) break;            if (eptr >= md->end_subject) break;
2165            GETCHARLEN(c, eptr, len);            GETCHARLENTEST(c, eptr, len);
2166            if (!_pcre_xclass(c, data)) break;            if (!_pcre_xclass(c, data)) break;
2167            eptr += len;            eptr += len;
2168            }            }
# Line 2609  for (;;) Line 2648  for (;;)
2648              {              {
2649              RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);              RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
2650              if (rrc != MATCH_NOMATCH) RRETURN(rrc);              if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2651                if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2652              GETCHARINC(d, eptr);              GETCHARINC(d, eptr);
2653              if (d < 256) d = md->lcc[d];              if (d < 256) d = md->lcc[d];
2654              if (fi >= max || eptr >= md->end_subject || fc == d)              if (fc == d) RRETURN(MATCH_NOMATCH);
2655                RRETURN(MATCH_NOMATCH);  
2656              }              }
2657            }            }
2658          else          else
# Line 2718  for (;;) Line 2758  for (;;)
2758              {              {
2759              RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);              RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
2760              if (rrc != MATCH_NOMATCH) RRETURN(rrc);              if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2761                if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2762              GETCHARINC(d, eptr);              GETCHARINC(d, eptr);
2763              if (fi >= max || eptr >= md->end_subject || fc == d)              if (fc == d) RRETURN(MATCH_NOMATCH);
               RRETURN(MATCH_NOMATCH);  
2764              }              }
2765            }            }
2766          else          else
# Line 4532  switch ((((options & PCRE_NEWLINE_BITS) Line 4572  switch ((((options & PCRE_NEWLINE_BITS)
4572          (pcre_uint32)options) & PCRE_NEWLINE_BITS)          (pcre_uint32)options) & PCRE_NEWLINE_BITS)
4573    {    {
4574    case 0: newline = NEWLINE; break;   /* Compile-time default */    case 0: newline = NEWLINE; break;   /* Compile-time default */
4575    case PCRE_NEWLINE_CR: newline = '\r'; break;    case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
4576    case PCRE_NEWLINE_LF: newline = '\n'; break;    case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
4577    case PCRE_NEWLINE_CR+    case PCRE_NEWLINE_CR+
4578         PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;         PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
4579    case PCRE_NEWLINE_ANY: newline = -1; break;    case PCRE_NEWLINE_ANY: newline = -1; break;
4580    case PCRE_NEWLINE_ANYCRLF: newline = -2; break;    case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
4581    default: return PCRE_ERROR_BADNEWLINE;    default: return PCRE_ERROR_BADNEWLINE;
# Line 4687  for(;;) Line 4727  for(;;)
4727      while (iptr < iend) *iptr++ = -1;      while (iptr < iend) *iptr++ = -1;
4728      }      }
4729    
4730    /* Advance to a unique first char if possible. If firstline is TRUE, the    /* If firstline is TRUE, the start of the match is constrained to the first
4731    start of the match is constrained to the first line of a multiline string.    line of a multiline string. That is, the match must be before or at the first
4732    That is, the match must be before or at the first newline. Implement this by    newline. Implement this by temporarily adjusting end_subject so that we stop
4733    temporarily adjusting end_subject so that we stop scanning at a newline. If    scanning at a newline. If the match fails at the newline, later code breaks
4734    the match fails at the newline, later code breaks this loop. */    this loop. */
4735    
4736    if (firstline)    if (firstline)
4737      {      {
4738      USPTR t = start_match;      USPTR t = start_match;
4739    #ifdef SUPPORT_UTF8
4740        if (utf8)
4741          {
4742          while (t < md->end_subject && !IS_NEWLINE(t))
4743            {
4744            t++;
4745            while (t < end_subject && (*t & 0xc0) == 0x80) t++;
4746            }
4747          }
4748        else
4749    #endif
4750      while (t < md->end_subject && !IS_NEWLINE(t)) t++;      while (t < md->end_subject && !IS_NEWLINE(t)) t++;
4751      end_subject = t;      end_subject = t;
4752      }      }
4753    
4754    /* Now test for a unique first byte */    /* There are some optimizations that avoid running the match if a known
4755      starting point is not found, or if a known later character is not present.
4756      However, there is an option that disables these, for testing and for ensuring
4757      that all callouts do actually occur. */
4758    
4759    if (first_byte >= 0)    if ((options & PCRE_NO_START_OPTIMIZE) == 0)
4760      {      {
4761      if (first_byte_caseless)      /* Advance to a unique first byte if there is one. */
4762        while (start_match < end_subject &&  
4763               md->lcc[*start_match] != first_byte)      if (first_byte >= 0)
4764          { NEXTCHAR(start_match); }        {
4765      else        if (first_byte_caseless)
4766        while (start_match < end_subject && *start_match != first_byte)          while (start_match < end_subject && md->lcc[*start_match] != first_byte)
4767          { NEXTCHAR(start_match); }            start_match++;
4768      }        else
4769            while (start_match < end_subject && *start_match != first_byte)
4770              start_match++;
4771          }
4772    
4773    /* Or to just after a linebreak for a multiline match if possible */      /* Or to just after a linebreak for a multiline match */
4774    
4775    else if (startline)      else if (startline)
     {  
     if (start_match > md->start_subject + start_offset)  
4776        {        {
4777        while (start_match <= end_subject && !WAS_NEWLINE(start_match))        if (start_match > md->start_subject + start_offset)
4778          { NEXTCHAR(start_match); }          {
4779    #ifdef SUPPORT_UTF8
4780            if (utf8)
4781              {
4782              while (start_match < end_subject && !WAS_NEWLINE(start_match))
4783                {
4784                start_match++;
4785                while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
4786                  start_match++;
4787                }
4788              }
4789            else
4790    #endif
4791            while (start_match < end_subject && !WAS_NEWLINE(start_match))
4792              start_match++;
4793    
4794            /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
4795            and we are now at a LF, advance the match position by one more character.
4796            */
4797    
4798        /* If we have just passed a CR and the newline option is ANY or ANYCRLF,          if (start_match[-1] == CHAR_CR &&
4799        and we are now at a LF, advance the match position by one more character.               (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
4800        */               start_match < end_subject &&
4801                 *start_match == CHAR_NL)
4802        if (start_match[-1] == '\r' &&            start_match++;
4803             (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&          }
            start_match < end_subject &&  
            *start_match == '\n')  
         start_match++;  
4804        }        }
     }  
4805    
4806    /* Or to a non-unique first char after study */      /* Or to a non-unique first byte after study */
4807    
4808    else if (start_bits != NULL)      else if (start_bits != NULL)
     {  
     while (start_match < end_subject)  
4809        {        {
4810        register unsigned int c = *start_match;        while (start_match < end_subject)
4811        if ((start_bits[c/8] & (1 << (c&7))) == 0)          {
4812          { NEXTCHAR(start_match); }          register unsigned int c = *start_match;
4813        else break;          if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++;
4814              else break;
4815            }
4816        }        }
4817      }      }   /* Starting optimizations */
4818    
4819    /* Restore fudged end_subject */    /* Restore fudged end_subject */
4820    
# Line 4757  for(;;) Line 4826  for(;;)
4826    printf("\n");    printf("\n");
4827  #endif  #endif
4828    
4829    /* If req_byte is set, we know that that character must appear in the subject    /* If req_byte is set, we know that that character must appear in the
4830    for the match to succeed. If the first character is set, req_byte must be    subject for the match to succeed. If the first character is set, req_byte
4831    later in the subject; otherwise the test starts at the match point. This    must be later in the subject; otherwise the test starts at the match point.
4832    optimization can save a huge amount of backtracking in patterns with nested    This optimization can save a huge amount of backtracking in patterns with
4833    unlimited repeats that aren't going to match. Writing separate code for    nested unlimited repeats that aren't going to match. Writing separate code
4834    cased/caseless versions makes it go faster, as does using an autoincrement    for cased/caseless versions makes it go faster, as does using an
4835    and backing off on a match.    autoincrement and backing off on a match.
4836    
4837    HOWEVER: when the subject string is very, very long, searching to its end can    HOWEVER: when the subject string is very, very long, searching to its end
4838    take a long time, and give bad performance on quite ordinary patterns. This    can take a long time, and give bad performance on quite ordinary patterns.
4839    showed up when somebody was matching something like /^\d+C/ on a 32-megabyte    This showed up when somebody was matching something like /^\d+C/ on a
4840    string... so we don't do this when the string is sufficiently long.    32-megabyte string... so we don't do this when the string is sufficiently
4841      long.
4842    
4843    ALSO: this processing is disabled when partial matching is requested.    ALSO: this processing is disabled when partial matching is requested, or if
4844    */    disabling is explicitly requested. */
4845    
4846    if (req_byte >= 0 &&    if ((options & PCRE_NO_START_OPTIMIZE) == 0 &&
4847          req_byte >= 0 &&
4848        end_subject - start_match < REQ_BYTE_MAX &&        end_subject - start_match < REQ_BYTE_MAX &&
4849        !md->partial)        !md->partial)
4850      {      {
# Line 4881  for(;;) Line 4952  for(;;)
4952    not contain any explicit matches for \r or \n, and the newline option is CRLF    not contain any explicit matches for \r or \n, and the newline option is CRLF
4953    or ANY or ANYCRLF, advance the match position by one more character. */    or ANY or ANYCRLF, advance the match position by one more character. */
4954    
4955    if (start_match[-1] == '\r' &&    if (start_match[-1] == CHAR_CR &&
4956        start_match < end_subject &&        start_match < end_subject &&
4957        *start_match == '\n' &&        *start_match == CHAR_NL &&
4958        (re->flags & PCRE_HASCRORLF) == 0 &&        (re->flags & PCRE_HASCRORLF) == 0 &&
4959          (md->nltype == NLTYPE_ANY ||          (md->nltype == NLTYPE_ANY ||
4960           md->nltype == NLTYPE_ANYCRLF ||           md->nltype == NLTYPE_ANYCRLF ||

Legend:
Removed from v.359  
changed lines
  Added in v.402

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12