/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Diff of /code/trunk/pcre_exec.c

Parent Directory | Revision Log | View Patch

revision 351 by ph10, Fri Jul 4 18:27:16 2008 UTC revision 391 by ph10, Tue Mar 17 21:16:01 2009 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2008 University of Cambridge             Copyright (c) 1997-2009 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 158  printf("\n"); Line 158  printf("\n");
158    
159  if (length > md->end_subject - eptr) return FALSE;  if (length > md->end_subject - eptr) return FALSE;
160    
161  /* Separate the caselesss case for speed */  /* Separate the caseless case for speed. In UTF-8 mode we can only do this
162    properly if Unicode properties are supported. Otherwise, we can check only
163    ASCII characters. */
164    
165  if ((ims & PCRE_CASELESS) != 0)  if ((ims & PCRE_CASELESS) != 0)
166    {    {
167    #ifdef SUPPORT_UTF8
168    #ifdef SUPPORT_UCP
169      if (md->utf8)
170        {
171        USPTR endptr = eptr + length;
172        while (eptr < endptr)
173          {
174          int c, d;
175          GETCHARINC(c, eptr);
176          GETCHARINC(d, p);
177          if (c != d && c != UCD_OTHERCASE(d)) return FALSE;
178          }
179        }
180      else
181    #endif
182    #endif
183    
184      /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
185      is no UCP support. */
186    
187    while (length-- > 0)    while (length-- > 0)
188      if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;      { if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; }
189    }    }
190    
191    /* In the caseful case, we can just compare the bytes, whether or not we
192    are in UTF-8 mode. */
193    
194  else  else
195    { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }    { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
196    
# Line 761  for (;;) Line 787  for (;;)
787    
788      case OP_COND:      case OP_COND:
789      case OP_SCOND:      case OP_SCOND:
790        /* Because of the way auto-callout works during compile, a callout item is
791        inserted between OP_COND and an assertion condition. */
792    
793        if (ecode[LINK_SIZE+1] == OP_CALLOUT)
794          {
795          if (pcre_callout != NULL)
796            {
797            pcre_callout_block cb;
798            cb.version          = 1;   /* Version 1 of the callout block */
799            cb.callout_number   = ecode[LINK_SIZE+2];
800            cb.offset_vector    = md->offset_vector;
801            cb.subject          = (PCRE_SPTR)md->start_subject;
802            cb.subject_length   = md->end_subject - md->start_subject;
803            cb.start_match      = mstart - md->start_subject;
804            cb.current_position = eptr - md->start_subject;
805            cb.pattern_position = GET(ecode, LINK_SIZE + 3);
806            cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
807            cb.capture_top      = offset_top/2;
808            cb.capture_last     = md->capture_last;
809            cb.callout_data     = md->callout_data;
810            if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
811            if (rrc < 0) RRETURN(rrc);
812            }
813          ecode += _pcre_OP_lengths[OP_CALLOUT];
814          }
815    
816        /* Now see what the actual condition is */
817    
818      if (ecode[LINK_SIZE+1] == OP_RREF)         /* Recursion test */      if (ecode[LINK_SIZE+1] == OP_RREF)         /* Recursion test */
819        {        {
820        offset = GET2(ecode, LINK_SIZE + 2);     /* Recursion group number*/        offset = GET2(ecode, LINK_SIZE + 2);     /* Recursion group number*/
# Line 1653  for (;;) Line 1707  for (;;)
1707      if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);      if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1708      GETCHARINCTEST(c, eptr);      GETCHARINCTEST(c, eptr);
1709        {        {
1710        const ucd_record * prop = GET_UCD(c);        const ucd_record *prop = GET_UCD(c);
1711    
1712        switch(ecode[1])        switch(ecode[1])
1713          {          {
# Line 2021  for (;;) Line 2075  for (;;)
2075    
2076    
2077      /* Match an extended character class. This opcode is encountered only      /* Match an extended character class. This opcode is encountered only
2078      in UTF-8 mode, because that's the only time it is compiled. */      when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2079        mode, because Unicode properties are supported in non-UTF-8 mode. */
2080    
2081  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2082      case OP_XCLASS:      case OP_XCLASS:
# Line 2063  for (;;) Line 2118  for (;;)
2118        for (i = 1; i <= min; i++)        for (i = 1; i <= min; i++)
2119          {          {
2120          if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);          if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2121          GETCHARINC(c, eptr);          GETCHARINCTEST(c, eptr);
2122          if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);          if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2123          }          }
2124    
# Line 2082  for (;;) Line 2137  for (;;)
2137            RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);            RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
2138            if (rrc != MATCH_NOMATCH) RRETURN(rrc);            if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2139            if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);            if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2140            GETCHARINC(c, eptr);            GETCHARINCTEST(c, eptr);
2141            if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);            if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2142            }            }
2143          /* Control never gets here */          /* Control never gets here */
# Line 2097  for (;;) Line 2152  for (;;)
2152            {            {
2153            int len = 1;            int len = 1;
2154            if (eptr >= md->end_subject) break;            if (eptr >= md->end_subject) break;
2155            GETCHARLEN(c, eptr, len);            GETCHARLENTEST(c, eptr, len);
2156            if (!_pcre_xclass(c, data)) break;            if (!_pcre_xclass(c, data)) break;
2157            eptr += len;            eptr += len;
2158            }            }
# Line 2583  for (;;) Line 2638  for (;;)
2638              {              {
2639              RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);              RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
2640              if (rrc != MATCH_NOMATCH) RRETURN(rrc);              if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2641                if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2642              GETCHARINC(d, eptr);              GETCHARINC(d, eptr);
2643              if (d < 256) d = md->lcc[d];              if (d < 256) d = md->lcc[d];
2644              if (fi >= max || eptr >= md->end_subject || fc == d)              if (fc == d) RRETURN(MATCH_NOMATCH);
2645                RRETURN(MATCH_NOMATCH);  
2646              }              }
2647            }            }
2648          else          else
# Line 2692  for (;;) Line 2748  for (;;)
2748              {              {
2749              RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);              RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
2750              if (rrc != MATCH_NOMATCH) RRETURN(rrc);              if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2751                if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2752              GETCHARINC(d, eptr);              GETCHARINC(d, eptr);
2753              if (fi >= max || eptr >= md->end_subject || fc == d)              if (fc == d) RRETURN(MATCH_NOMATCH);
               RRETURN(MATCH_NOMATCH);  
2754              }              }
2755            }            }
2756          else          else
# Line 4358  Returns: > 0 => success; value Line 4414  Returns: > 0 => success; value
4414                   < -1 => some kind of unexpected problem                   < -1 => some kind of unexpected problem
4415  */  */
4416    
4417  PCRE_EXP_DEFN int  PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
4418  pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,  pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
4419    PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,    PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
4420    int offsetcount)    int offsetcount)
# Line 4506  switch ((((options & PCRE_NEWLINE_BITS) Line 4562  switch ((((options & PCRE_NEWLINE_BITS)
4562          (pcre_uint32)options) & PCRE_NEWLINE_BITS)          (pcre_uint32)options) & PCRE_NEWLINE_BITS)
4563    {    {
4564    case 0: newline = NEWLINE; break;   /* Compile-time default */    case 0: newline = NEWLINE; break;   /* Compile-time default */
4565    case PCRE_NEWLINE_CR: newline = '\r'; break;    case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
4566    case PCRE_NEWLINE_LF: newline = '\n'; break;    case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
4567    case PCRE_NEWLINE_CR+    case PCRE_NEWLINE_CR+
4568         PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;         PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
4569    case PCRE_NEWLINE_ANY: newline = -1; break;    case PCRE_NEWLINE_ANY: newline = -1; break;
4570    case PCRE_NEWLINE_ANYCRLF: newline = -2; break;    case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
4571    default: return PCRE_ERROR_BADNEWLINE;    default: return PCRE_ERROR_BADNEWLINE;
# Line 4660  for(;;) Line 4716  for(;;)
4716      register int *iend = iptr + resetcount;      register int *iend = iptr + resetcount;
4717      while (iptr < iend) *iptr++ = -1;      while (iptr < iend) *iptr++ = -1;
4718      }      }
4719    
4720    /* Advance to a unique first char if possible. If firstline is TRUE, the    /* If firstline is TRUE, the start of the match is constrained to the first
4721    start of the match is constrained to the first line of a multiline string.    line of a multiline string. That is, the match must be before or at the first
4722    That is, the match must be before or at the first newline. Implement this by    newline. Implement this by temporarily adjusting end_subject so that we stop
4723    temporarily adjusting end_subject so that we stop scanning at a newline. If    scanning at a newline. If the match fails at the newline, later code breaks
4724    the match fails at the newline, later code breaks this loop. */    this loop. */
4725    
4726    if (firstline)    if (firstline)
4727      {      {
4728      USPTR t = start_match;      USPTR t = start_match;
4729    #ifdef SUPPORT_UTF8
4730        if (utf8)
4731          {
4732          while (t < md->end_subject && !IS_NEWLINE(t))
4733            {
4734            t++;
4735            while (t < end_subject && (*t & 0xc0) == 0x80) t++;
4736            }
4737          }
4738        else
4739    #endif
4740      while (t < md->end_subject && !IS_NEWLINE(t)) t++;      while (t < md->end_subject && !IS_NEWLINE(t)) t++;
4741      end_subject = t;      end_subject = t;
4742      }      }
4743    
4744    /* Now test for a unique first byte */    /* There are some optimizations that avoid running the match if a known
4745      starting point is not found, or if a known later character is not present.
4746    if (first_byte >= 0)    However, there is an option that disables these, for testing and for ensuring
4747      {    that all callouts do actually occur. */
4748      if (first_byte_caseless)  
4749        while (start_match < end_subject &&    if ((options & PCRE_NO_START_OPTIMIZE) == 0)
4750               md->lcc[*start_match] != first_byte)      {
4751          { NEXTCHAR(start_match); }      /* Advance to a unique first byte if there is one. */
4752      else  
4753        while (start_match < end_subject && *start_match != first_byte)      if (first_byte >= 0)
4754          { NEXTCHAR(start_match); }        {
4755      }        if (first_byte_caseless)
4756            while (start_match < end_subject && md->lcc[*start_match] != first_byte)
4757    /* Or to just after a linebreak for a multiline match if possible */            start_match++;
4758          else
4759    else if (startline)          while (start_match < end_subject && *start_match != first_byte)
4760      {            start_match++;
     if (start_match > md->start_subject + start_offset)  
       {  
       while (start_match <= end_subject && !WAS_NEWLINE(start_match))  
         { NEXTCHAR(start_match); }  
   
       /* If we have just passed a CR and the newline option is ANY or ANYCRLF,  
       and we are now at a LF, advance the match position by one more character.  
       */  
   
       if (start_match[-1] == '\r' &&  
            (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&  
            start_match < end_subject &&  
            *start_match == '\n')  
         start_match++;  
4761        }        }
4762      }  
4763        /* Or to just after a linebreak for a multiline match */
4764    /* Or to a non-unique first char after study */  
4765        else if (startline)
   else if (start_bits != NULL)  
     {  
     while (start_match < end_subject)  
4766        {        {
4767        register unsigned int c = *start_match;        if (start_match > md->start_subject + start_offset)
4768        if ((start_bits[c/8] & (1 << (c&7))) == 0)          {
4769          { NEXTCHAR(start_match); }  #ifdef SUPPORT_UTF8
4770        else break;          if (utf8)
4771              {
4772              while (start_match < end_subject && !WAS_NEWLINE(start_match))
4773                {
4774                start_match++;
4775                while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
4776                  start_match++;
4777                }
4778              }
4779            else
4780    #endif
4781            while (start_match < end_subject && !WAS_NEWLINE(start_match))
4782              start_match++;
4783    
4784            /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
4785            and we are now at a LF, advance the match position by one more character.
4786            */
4787    
4788            if (start_match[-1] == CHAR_CR &&
4789                 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
4790                 start_match < end_subject &&
4791                 *start_match == CHAR_NL)
4792              start_match++;
4793            }
4794          }
4795    
4796        /* Or to a non-unique first byte after study */
4797    
4798        else if (start_bits != NULL)
4799          {
4800          while (start_match < end_subject)
4801            {
4802            register unsigned int c = *start_match;
4803            if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++;
4804              else break;
4805            }
4806        }        }
4807      }      }   /* Starting optimizations */
4808    
4809    /* Restore fudged end_subject */    /* Restore fudged end_subject */
4810    
4811    end_subject = save_end_subject;    end_subject = save_end_subject;
4812    
4813  #ifdef DEBUG  /* Sigh. Some compilers never learn. */  #ifdef DEBUG  /* Sigh. Some compilers never learn. */
# Line 4731  for(;;) Line 4816  for(;;)
4816    printf("\n");    printf("\n");
4817  #endif  #endif
4818    
4819    /* If req_byte is set, we know that that character must appear in the subject    /* If req_byte is set, we know that that character must appear in the
4820    for the match to succeed. If the first character is set, req_byte must be    subject for the match to succeed. If the first character is set, req_byte
4821    later in the subject; otherwise the test starts at the match point. This    must be later in the subject; otherwise the test starts at the match point.
4822    optimization can save a huge amount of backtracking in patterns with nested    This optimization can save a huge amount of backtracking in patterns with
4823    unlimited repeats that aren't going to match. Writing separate code for    nested unlimited repeats that aren't going to match. Writing separate code
4824    cased/caseless versions makes it go faster, as does using an autoincrement    for cased/caseless versions makes it go faster, as does using an
4825    and backing off on a match.    autoincrement and backing off on a match.
4826    
4827    HOWEVER: when the subject string is very, very long, searching to its end can    HOWEVER: when the subject string is very, very long, searching to its end
4828    take a long time, and give bad performance on quite ordinary patterns. This    can take a long time, and give bad performance on quite ordinary patterns.
4829    showed up when somebody was matching something like /^\d+C/ on a 32-megabyte    This showed up when somebody was matching something like /^\d+C/ on a
4830    string... so we don't do this when the string is sufficiently long.    32-megabyte string... so we don't do this when the string is sufficiently
4831      long.
4832    
4833    ALSO: this processing is disabled when partial matching is requested.    ALSO: this processing is disabled when partial matching is requested, or if
4834    */    disabling is explicitly requested. */
4835    
4836    if (req_byte >= 0 &&    if ((options & PCRE_NO_START_OPTIMIZE) == 0 &&
4837          req_byte >= 0 &&
4838        end_subject - start_match < REQ_BYTE_MAX &&        end_subject - start_match < REQ_BYTE_MAX &&
4839        !md->partial)        !md->partial)
4840      {      {
# Line 4855  for(;;) Line 4942  for(;;)
4942    not contain any explicit matches for \r or \n, and the newline option is CRLF    not contain any explicit matches for \r or \n, and the newline option is CRLF
4943    or ANY or ANYCRLF, advance the match position by one more character. */    or ANY or ANYCRLF, advance the match position by one more character. */
4944    
4945    if (start_match[-1] == '\r' &&    if (start_match[-1] == CHAR_CR &&
4946        start_match < end_subject &&        start_match < end_subject &&
4947        *start_match == '\n' &&        *start_match == CHAR_NL &&
4948        (re->flags & PCRE_HASCRORLF) == 0 &&        (re->flags & PCRE_HASCRORLF) == 0 &&
4949          (md->nltype == NLTYPE_ANY ||          (md->nltype == NLTYPE_ANY ||
4950           md->nltype == NLTYPE_ANYCRLF ||           md->nltype == NLTYPE_ANYCRLF ||

Legend:
Removed from v.351  
changed lines
  Added in v.391

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12