/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Diff of /code/trunk/pcre_dfa_exec.c

Parent Directory | Revision Log | View Patch

revision 87 by nigel, Sat Feb 24 21:41:21 2007 UTC | revision 91 by nigel, Sat Feb 24 21:41:34 2007 UTC
# Line 43  alternative matching function that uses Line 43  alternative matching function that uses
43  compatible, but it has advantages in certain applications. */  compatible, but it has advantages in certain applications. */
44    
45    
46    #define NLBLOCK md           /* The block containing newline information */
47  #include "pcre_internal.h"  #include "pcre_internal.h"
48    
49    
# Line 423  ptr = current_subject; Line 424  ptr = current_subject;
424  for (;;)  for (;;)
425    {    {
426    int i, j;    int i, j;
427    int c, d, clen, dlen;    int clen, dlen;
428      unsigned int c, d;
429    
430    /* Make the new state list into the active state list and empty the    /* Make the new state list into the active state list and empty the
431    new state list. */    new state list. */
# Line 647  for (;;) Line 649  for (;;)
649        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
650        case OP_CIRC:        case OP_CIRC:
651        if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||        if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
652            ((ims & PCRE_MULTILINE) != 0 && ptr[-1] == NEWLINE))            ((ims & PCRE_MULTILINE) != 0 &&
653                ptr >= start_subject + md->nllen &&
654                ptr != end_subject &&
655                IS_NEWLINE(ptr - md->nllen)))
656          { ADD_ACTIVE(state_offset + 1, 0); }          { ADD_ACTIVE(state_offset + 1, 0); }
657        break;        break;
658    
# Line 681  for (;;) Line 686  for (;;)
686    
687        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
688        case OP_ANY:        case OP_ANY:
689        if (clen > 0 && (c != NEWLINE || (ims & PCRE_DOTALL) != 0))        if (clen > 0 && ((ims & PCRE_DOTALL) != 0 ||
690                           ptr > end_subject - md->nllen ||
691                           !IS_NEWLINE(ptr)))
692          { ADD_NEW(state_offset + 1, 0); }          { ADD_NEW(state_offset + 1, 0); }
693        break;        break;
694    
695        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
696        case OP_EODN:        case OP_EODN:
697        if (clen == 0 || (c == NEWLINE && ptr + 1 == end_subject))        if (clen == 0 ||
698               (ptr == end_subject - md->nllen && IS_NEWLINE(ptr)))
699          { ADD_ACTIVE(state_offset + 1, 0); }          { ADD_ACTIVE(state_offset + 1, 0); }
700        break;        break;
701    
# Line 695  for (;;) Line 703  for (;;)
703        case OP_DOLL:        case OP_DOLL:
704        if ((md->moptions & PCRE_NOTEOL) == 0)        if ((md->moptions & PCRE_NOTEOL) == 0)
705          {          {
706          if (clen == 0 || (c == NEWLINE && (ptr + 1 == end_subject ||          if (clen == 0 ||
707                                  (ims & PCRE_MULTILINE) != 0)))              (ptr <= end_subject - md->nllen && IS_NEWLINE(ptr) &&
708                   ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
709                ))
710            { ADD_ACTIVE(state_offset + 1, 0); }            { ADD_ACTIVE(state_offset + 1, 0); }
711          }          }
712        else if (c == NEWLINE && (ims & PCRE_MULTILINE) != 0)        else if ((ims & PCRE_MULTILINE) != 0 &&
713                   ptr <= end_subject - md->nllen && IS_NEWLINE(ptr))
714          { ADD_ACTIVE(state_offset + 1, 0); }          { ADD_ACTIVE(state_offset + 1, 0); }
715        break;        break;
716    
# Line 811  for (;;) Line 822  for (;;)
822          {          {
823          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
824              (c < 256 &&              (c < 256 &&
825                (d != OP_ANY || c != '\n' || (ims & PCRE_DOTALL) != 0) &&                (d != OP_ANY ||
826                   (ims & PCRE_DOTALL) != 0 ||
827                   ptr > end_subject - md->nllen ||
828                   !IS_NEWLINE(ptr)
829                  ) &&
830                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
831            {            {
832            count++;            count++;
# Line 828  for (;;) Line 843  for (;;)
843          {          {
844          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
845              (c < 256 &&              (c < 256 &&
846                (d != OP_ANY || c != '\n' || (ims & PCRE_DOTALL) != 0) &&                (d != OP_ANY ||
847                   (ims & PCRE_DOTALL) != 0 ||
848                   ptr > end_subject - md->nllen ||
849                   !IS_NEWLINE(ptr)
850                  ) &&
851                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
852            {            {
853            ADD_NEW(state_offset + 2, 0);            ADD_NEW(state_offset + 2, 0);
# Line 844  for (;;) Line 863  for (;;)
863          {          {
864          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
865              (c < 256 &&              (c < 256 &&
866                (d != OP_ANY || c != '\n' || (ims & PCRE_DOTALL) != 0) &&                (d != OP_ANY ||
867                   (ims & PCRE_DOTALL) != 0 ||
868                   ptr > end_subject - md->nllen ||
869                   !IS_NEWLINE(ptr)
870                  ) &&
871                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
872            {            {
873            ADD_NEW(state_offset, 0);            ADD_NEW(state_offset, 0);
# Line 863  for (;;) Line 886  for (;;)
886          {          {
887          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
888              (c < 256 &&              (c < 256 &&
889                (d != OP_ANY || c != '\n' || (ims & PCRE_DOTALL) != 0) &&                (d != OP_ANY ||
890                   (ims & PCRE_DOTALL) != 0 ||
891                   ptr > end_subject - md->nllen ||
892                   !IS_NEWLINE(ptr)
893                  ) &&
894                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
895            {            {
896            if (++count >= GET2(code, 1))            if (++count >= GET2(code, 1))
# Line 1220  for (;;) Line 1247  for (;;)
1247        if (clen > 0)        if (clen > 0)
1248          {          {
1249          int otherd = -1;          int otherd = -1;
1250          if ((ims && PCRE_CASELESS) != 0)          if ((ims & PCRE_CASELESS) != 0)
1251            {            {
1252  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1253            if (utf8 && d >= 128)            if (utf8 && d >= 128)
# Line 1247  for (;;) Line 1274  for (;;)
1274        if (clen > 0)        if (clen > 0)
1275          {          {
1276          int otherd = -1;          int otherd = -1;
1277          if ((ims && PCRE_CASELESS) != 0)          if ((ims & PCRE_CASELESS) != 0)
1278            {            {
1279  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1280            if (utf8 && d >= 128)            if (utf8 && d >= 128)
# Line 1370  for (;;) Line 1397  for (;;)
1397              { ADD_ACTIVE(next_state_offset + 5, 0); }              { ADD_ACTIVE(next_state_offset + 5, 0); }
1398            if (isinclass)            if (isinclass)
1399              {              {
1400              if (++count >= GET2(ecode, 3))              int max = GET2(ecode, 3);
1401                if (++count >= max && max != 0)   /* Max 0 => no limit */
1402                { ADD_NEW(next_state_offset + 5, 0); }                { ADD_NEW(next_state_offset + 5, 0); }
1403              else              else
1404                { ADD_NEW(state_offset, count); }                { ADD_NEW(state_offset, count); }
# Line 1670  for (;;) Line 1698  for (;;)
1698      DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"      DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
1699        "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,        "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
1700        rlevel*2-2, SP));        rlevel*2-2, SP));
1701      return match_count;      break;        /* In effect, "return", but see the comment below */
1702      }      }
1703    
1704    /* One or more states are active for the next character. */    /* One or more states are active for the next character. */
# Line 1678  for (;;) Line 1706  for (;;)
1706    ptr += clen;    /* Advance to next subject character */    ptr += clen;    /* Advance to next subject character */
1707    }               /* Loop to move along the subject string */    }               /* Loop to move along the subject string */
1708    
1709  /* Control never gets here, but we must keep the compiler happy. */  /* Control gets here from "break" a few lines above. We do it this way because
1710    if we use "return" above, we have compiler trouble. Some compilers warn if
1711    there's nothing here because they think the function doesn't return a value. On
1712    the other hand, if we put a dummy statement here, some more clever compilers
1713    complain that it can't be reached. Sigh. */
1714    
1715  DPRINTF(("%.*s+++ Unexpected end of internal_dfa_exec %d +++\n"  return match_count;
   "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, rlevel*2-2, SP));  
 return PCRE_ERROR_NOMATCH;  
1716  }  }
1717    
1718    
# Line 1721  pcre_dfa_exec(const pcre *argument_re, c Line 1751  pcre_dfa_exec(const pcre *argument_re, c
1751  {  {
1752  real_pcre *re = (real_pcre *)argument_re;  real_pcre *re = (real_pcre *)argument_re;
1753  dfa_match_data match_block;  dfa_match_data match_block;
1754    dfa_match_data *md = &match_block;
1755  BOOL utf8, anchored, startline, firstline;  BOOL utf8, anchored, startline, firstline;
1756  const uschar *current_subject, *end_subject, *lcc;  const uschar *current_subject, *end_subject, *lcc;
1757    
# Line 1735  BOOL req_byte_caseless = FALSE; Line 1766  BOOL req_byte_caseless = FALSE;
1766  int first_byte = -1;  int first_byte = -1;
1767  int req_byte = -1;  int req_byte = -1;
1768  int req_byte2 = -1;  int req_byte2 = -1;
1769    int newline;
1770    
1771  /* Plausibility checks */  /* Plausibility checks */
1772    
# Line 1749  flipping, so we scan the extra_data bloc Line 1781  flipping, so we scan the extra_data bloc
1781  match block, so we must initialize them beforehand. However, the other fields  match block, so we must initialize them beforehand. However, the other fields
1782  in the match block must not be set until after the byte flipping. */  in the match block must not be set until after the byte flipping. */
1783    
1784  match_block.tables = re->tables;  md->tables = re->tables;
1785  match_block.callout_data = NULL;  md->callout_data = NULL;
1786    
1787  if (extra_data != NULL)  if (extra_data != NULL)
1788    {    {
# Line 1761  if (extra_data != NULL) Line 1793  if (extra_data != NULL)
1793    if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)    if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
1794      return PCRE_ERROR_DFA_UMLIMIT;      return PCRE_ERROR_DFA_UMLIMIT;
1795    if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)    if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
1796      match_block.callout_data = extra_data->callout_data;      md->callout_data = extra_data->callout_data;
1797    if ((flags & PCRE_EXTRA_TABLES) != 0)    if ((flags & PCRE_EXTRA_TABLES) != 0)
1798      match_block.tables = extra_data->tables;      md->tables = extra_data->tables;
1799    }    }
1800    
1801  /* Check that the first field in the block is the magic number. If it is not,  /* Check that the first field in the block is the magic number. If it is not,
# Line 1784  current_subject = (const unsigned char * Line 1816  current_subject = (const unsigned char *
1816  end_subject = (const unsigned char *)subject + length;  end_subject = (const unsigned char *)subject + length;
1817  req_byte_ptr = current_subject - 1;  req_byte_ptr = current_subject - 1;
1818    
1819    #ifdef SUPPORT_UTF8
1820  utf8 = (re->options & PCRE_UTF8) != 0;  utf8 = (re->options & PCRE_UTF8) != 0;
1821    #else
1822    utf8 = FALSE;
1823    #endif
1824    
1825  anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||  anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
1826    (re->options & PCRE_ANCHORED) != 0;    (re->options & PCRE_ANCHORED) != 0;
1827    
1828  /* The remaining fixed data for passing around. */  /* The remaining fixed data for passing around. */
1829    
1830  match_block.start_code = (const uschar *)argument_re +  md->start_code = (const uschar *)argument_re +
1831      re->name_table_offset + re->name_count * re->name_entry_size;      re->name_table_offset + re->name_count * re->name_entry_size;
1832  match_block.start_subject = (const unsigned char *)subject;  md->start_subject = (const unsigned char *)subject;
1833  match_block.end_subject = end_subject;  md->end_subject = end_subject;
1834  match_block.moptions = options;  md->moptions = options;
1835  match_block.poptions = re->options;  md->poptions = re->options;
1836    
1837    /* Handle different types of newline. The two bits give four cases. If nothing
1838    is set at run time, whatever was used at compile time applies. */
1839    
1840    switch ((((options & PCRE_NEWLINE_CRLF) == 0)? re->options : options) &
1841             PCRE_NEWLINE_CRLF)
1842      {
1843      default:              newline = NEWLINE; break;   /* Compile-time default */
1844      case PCRE_NEWLINE_CR: newline = '\r'; break;
1845      case PCRE_NEWLINE_LF: newline = '\n'; break;
1846      case PCRE_NEWLINE_CR+
1847           PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
1848      }
1849    
1850    if (newline > 255)
1851      {
1852      md->nllen = 2;
1853      md->nl[0] = (newline >> 8) & 255;
1854      md->nl[1] = newline & 255;
1855      }
1856    else
1857      {
1858      md->nllen = 1;
1859      md->nl[0] = newline;
1860      }
1861    
1862  /* Check a UTF-8 string if required. Unfortunately there's no way of passing  /* Check a UTF-8 string if required. Unfortunately there's no way of passing
1863  back the character offset. */  back the character offset. */
# Line 1822  if (utf8 && (options & PCRE_NO_UTF8_CHEC Line 1883  if (utf8 && (options & PCRE_NO_UTF8_CHEC
1883  is a feature that makes it possible to save compiled regex and re-use them  is a feature that makes it possible to save compiled regex and re-use them
1884  in other programs later. */  in other programs later. */
1885    
1886  if (match_block.tables == NULL) match_block.tables = _pcre_default_tables;  if (md->tables == NULL) md->tables = _pcre_default_tables;
1887    
1888  /* The lower casing table and the "must be at the start of a line" flag are  /* The lower casing table and the "must be at the start of a line" flag are
1889  used in a loop when finding where to start. */  used in a loop when finding where to start. */
1890    
1891  lcc = match_block.tables + lcc_offset;  lcc = md->tables + lcc_offset;
1892  startline = (re->options & PCRE_STARTLINE) != 0;  startline = (re->options & PCRE_STARTLINE) != 0;
1893  firstline = (re->options & PCRE_FIRSTLINE) != 0;  firstline = (re->options & PCRE_FIRSTLINE) != 0;
1894    
# Line 1860  if ((re->options & PCRE_REQCHSET) != 0) Line 1921  if ((re->options & PCRE_REQCHSET) != 0)
1921    {    {
1922    req_byte = re->req_byte & 255;    req_byte = re->req_byte & 255;
1923    req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;    req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
1924    req_byte2 = (match_block.tables + fcc_offset)[req_byte];  /* case flipped */    req_byte2 = (md->tables + fcc_offset)[req_byte];  /* case flipped */
1925    }    }
1926    
1927  /* Call the main matching function, looping for a non-anchored regex after a  /* Call the main matching function, looping for a non-anchored regex after a
# Line 1885  for (;;) Line 1946  for (;;)
1946      if (firstline)      if (firstline)
1947        {        {
1948        const uschar *t = current_subject;        const uschar *t = current_subject;
1949        while (t < save_end_subject && *t != '\n') t++;        while (t <= save_end_subject - md->nllen && !IS_NEWLINE(t)) t++;
1950        end_subject = t;        end_subject = t;
1951        }        }
1952    
# Line 1900  for (;;) Line 1961  for (;;)
1961            current_subject++;            current_subject++;
1962        }        }
1963    
1964      /* Or to just after \n for a multiline match if possible */      /* Or to just after a linebreak for a multiline match if possible */
1965    
1966      else if (startline)      else if (startline)
1967        {        {
1968        if (current_subject > match_block.start_subject + start_offset)        if (current_subject > md->start_subject + md->nllen +
1969              start_offset)
1970          {          {
1971          while (current_subject < end_subject && current_subject[-1] != NEWLINE)          while (current_subject <= end_subject &&
1972                   !IS_NEWLINE(current_subject - md->nllen))
1973            current_subject++;            current_subject++;
1974          }          }
1975        }        }
# Line 1987  for (;;) Line 2050  for (;;)
2050    /* OK, now we can do the business */    /* OK, now we can do the business */
2051    
2052    rc = internal_dfa_exec(    rc = internal_dfa_exec(
2053      &match_block,                              /* fixed match data */      md,                                /* fixed match data */
2054      match_block.start_code,                    /* this subexpression's code */      md->start_code,                    /* this subexpression's code */
2055      current_subject,                           /* where we currently are */      current_subject,                   /* where we currently are */
2056      start_offset,                              /* start offset in subject */      start_offset,                      /* start offset in subject */
2057      offsets,                                   /* offset vector */      offsets,                           /* offset vector */
2058      offsetcount,                               /* size of same */      offsetcount,                       /* size of same */
2059      workspace,                                 /* workspace vector */      workspace,                         /* workspace vector */
2060      wscount,                                   /* size of same */      wscount,                           /* size of same */
2061      re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */      re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */
2062      0,                                         /* function recurse level */      0,                                 /* function recurse level */
2063      0);                                        /* regex recurse level */      0);                                /* regex recurse level */
2064    
2065    /* Anything other than "no match" means we are done, always; otherwise, carry    /* Anything other than "no match" means we are done, always; otherwise, carry
2066    on only if not anchored. */    on only if not anchored. */
# Line 2007  for (;;) Line 2070  for (;;)
2070    /* Advance to the next subject character unless we are at the end of a line    /* Advance to the next subject character unless we are at the end of a line
2071    and firstline is set. */    and firstline is set. */
2072    
2073    if (firstline && *current_subject == NEWLINE) break;    if (firstline &&
2074          current_subject <= end_subject - md->nllen &&
2075          IS_NEWLINE(current_subject)) break;
2076    current_subject++;    current_subject++;
   
 #ifdef SUPPORT_UTF8  
2077    if (utf8)    if (utf8)
2078      {      {
2079      while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)      while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
2080        current_subject++;        current_subject++;
2081      }      }
 #endif  
   
2082    if (current_subject > end_subject) break;    if (current_subject > end_subject) break;
2083    }    }
2084    

Legend:
Removed from v.87  
changed lines
  Added in v.91

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12