/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Diff of /code/trunk/pcre_dfa_exec.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 151 by ph10, Tue Apr 17 15:07:29 2007 UTC revision 199 by ph10, Tue Jul 31 14:39:09 2007 UTC
# Line 44  FSM). This is NOT Perl-compatible, but Line 44  FSM). This is NOT Perl-compatible, but
44  applications. */  applications. */
45    
46    
47    #ifdef HAVE_CONFIG_H
48    #include <config.h>
49    #endif
50    
51  #define NLBLOCK md             /* Block containing newline information */  #define NLBLOCK md             /* Block containing newline information */
52  #define PSSTART start_subject  /* Field containing processed string start */  #define PSSTART start_subject  /* Field containing processed string start */
53  #define PSEND   end_subject    /* Field containing processed string end */  #define PSEND   end_subject    /* Field containing processed string end */
# Line 63  applications. */ Line 67  applications. */
67    
68  /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes  /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
69  into others, under special conditions. A gap of 20 between the blocks should be  into others, under special conditions. A gap of 20 between the blocks should be
70  enough. */  enough. The resulting opcodes don't have to be less than 256 because they are
71    never stored, so we push them well clear of the normal opcodes. */
72    
73  #define OP_PROP_EXTRA 100  #define OP_PROP_EXTRA       300
74  #define OP_EXTUNI_EXTRA 120  #define OP_EXTUNI_EXTRA     320
75  #define OP_ANYNL_EXTRA 140  #define OP_ANYNL_EXTRA      340
76    #define OP_HSPACE_EXTRA     360
77    #define OP_VSPACE_EXTRA     380
78    
79    
80  /* This table identifies those opcodes that are followed immediately by a  /* This table identifies those opcodes that are followed immediately by a
81  character that is to be tested in some way. This makes it possible to  character that is to be tested in some way. This makes it possible to
82  centralize the loading of these characters. In the case of Type * etc, the  centralize the loading of these characters. In the case of Type * etc, the
83  "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a  "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
84  small value. */  small value. ***NOTE*** If the start of this table is modified, the two tables
85    that follow must also be modified. */
86    
87  static uschar coptable[] = {  static uschar coptable[] = {
88    0,                             /* End                                    */    0,                             /* End                                    */
89    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* \A, \G, \B, \b, \D, \d, \S, \s, \W, \w */    0, 0, 0, 0, 0,                 /* \A, \G, \K, \B, \b                     */
90      0, 0, 0, 0, 0, 0,              /* \D, \d, \S, \s, \W, \w                 */
91    0, 0,                          /* Any, Anybyte                           */    0, 0,                          /* Any, Anybyte                           */
92    0, 0, 0, 0,                    /* NOTPROP, PROP, EXTUNI, ANYNL           */    0, 0, 0,                       /* NOTPROP, PROP, EXTUNI                  */
93      0, 0, 0, 0, 0,                 /* \R, \H, \h, \V, \v                     */
94    0, 0, 0, 0, 0,                 /* \Z, \z, Opt, ^, $                      */    0, 0, 0, 0, 0,                 /* \Z, \z, Opt, ^, $                      */
95    1,                             /* Char                                   */    1,                             /* Char                                   */
96    1,                             /* Charnc                                 */    1,                             /* Charnc                                 */
# Line 127  static uschar coptable[] = { Line 137  static uschar coptable[] = {
137  and \w */  and \w */
138    
139  static uschar toptable1[] = {  static uschar toptable1[] = {
140    0, 0, 0, 0, 0,    0, 0, 0, 0, 0, 0,
141    ctype_digit, ctype_digit,    ctype_digit, ctype_digit,
142    ctype_space, ctype_space,    ctype_space, ctype_space,
143    ctype_word,  ctype_word,    ctype_word,  ctype_word,
# Line 135  static uschar toptable1[] = { Line 145  static uschar toptable1[] = {
145  };  };
146    
147  static uschar toptable2[] = {  static uschar toptable2[] = {
148    0, 0, 0, 0, 0,    0, 0, 0, 0, 0, 0,
149    ctype_digit, 0,    ctype_digit, 0,
150    ctype_space, 0,    ctype_space, 0,
151    ctype_word,  0,    ctype_word,  0,
# Line 500  for (;;) Line 510  for (;;)
510      const uschar *code;      const uschar *code;
511      int state_offset = current_state->offset;      int state_offset = current_state->offset;
512      int count, codevalue;      int count, codevalue;
513  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
514      int chartype, script;      int chartype, script;
515  #endif  #endif
516    
517  #ifdef DEBUG  #ifdef DEBUG
518      printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);      printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
# Line 557  for (;;) Line 567  for (;;)
567      permitted.      permitted.
568    
569      We also use this mechanism for opcodes such as OP_TYPEPLUS that take an      We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
570      argument that is not a data character - but is always one byte long.      argument that is not a data character - but is always one byte long. We
571      Unfortunately, we have to take special action to deal with  \P, \p, and      have to take special action to deal with  \P, \p, \H, \h, \V, \v and \X in
572      \X in this case. To keep the other cases fast, convert these ones to new      this case. To keep the other cases fast, convert these ones to new opcodes.
573      opcodes. */      */
574    
575      if (coptable[codevalue] > 0)      if (coptable[codevalue] > 0)
576        {        {
# Line 578  for (;;) Line 588  for (;;)
588            case OP_PROP: codevalue += OP_PROP_EXTRA; break;            case OP_PROP: codevalue += OP_PROP_EXTRA; break;
589            case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;            case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
590            case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;            case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
591              case OP_NOT_HSPACE:
592              case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
593              case OP_NOT_VSPACE:
594              case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
595            default: break;            default: break;
596            }            }
597          }          }
# Line 1088  for (;;) Line 1102  for (;;)
1102        break;        break;
1103    
1104        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1105          case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1106          case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1107          case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1108          count = current_state->count;  /* Already matched */
1109          if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1110          if (clen > 0)
1111            {
1112            BOOL OK;
1113            switch (c)
1114              {
1115              case 0x000a:
1116              case 0x000b:
1117              case 0x000c:
1118              case 0x000d:
1119              case 0x0085:
1120              case 0x2028:
1121              case 0x2029:
1122              OK = TRUE;
1123              break;
1124    
1125              default:
1126              OK = FALSE;
1127              break;
1128              }
1129    
1130            if (OK == (d == OP_VSPACE))
1131              {
1132              if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1133                {
1134                active_count--;           /* Remove non-match possibility */
1135                next_active_state--;
1136                }
1137              count++;
1138              ADD_NEW_DATA(-state_offset, count, 0);
1139              }
1140            }
1141          break;
1142    
1143          /*-----------------------------------------------------------------*/
1144          case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1145          case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1146          case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1147          count = current_state->count;  /* Already matched */
1148          if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1149          if (clen > 0)
1150            {
1151            BOOL OK;
1152            switch (c)
1153              {
1154              case 0x09:      /* HT */
1155              case 0x20:      /* SPACE */
1156              case 0xa0:      /* NBSP */
1157              case 0x1680:    /* OGHAM SPACE MARK */
1158              case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1159              case 0x2000:    /* EN QUAD */
1160              case 0x2001:    /* EM QUAD */
1161              case 0x2002:    /* EN SPACE */
1162              case 0x2003:    /* EM SPACE */
1163              case 0x2004:    /* THREE-PER-EM SPACE */
1164              case 0x2005:    /* FOUR-PER-EM SPACE */
1165              case 0x2006:    /* SIX-PER-EM SPACE */
1166              case 0x2007:    /* FIGURE SPACE */
1167              case 0x2008:    /* PUNCTUATION SPACE */
1168              case 0x2009:    /* THIN SPACE */
1169              case 0x200A:    /* HAIR SPACE */
1170              case 0x202f:    /* NARROW NO-BREAK SPACE */
1171              case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1172              case 0x3000:    /* IDEOGRAPHIC SPACE */
1173              OK = TRUE;
1174              break;
1175    
1176              default:
1177              OK = FALSE;
1178              break;
1179              }
1180    
1181            if (OK == (d == OP_HSPACE))
1182              {
1183              if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1184                {
1185                active_count--;           /* Remove non-match possibility */
1186                next_active_state--;
1187                }
1188              count++;
1189              ADD_NEW_DATA(-state_offset, count, 0);
1190              }
1191            }
1192          break;
1193    
1194          /*-----------------------------------------------------------------*/
1195  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
1196        case OP_PROP_EXTRA + OP_TYPEQUERY:        case OP_PROP_EXTRA + OP_TYPEQUERY:
1197        case OP_PROP_EXTRA + OP_TYPEMINQUERY:        case OP_PROP_EXTRA + OP_TYPEMINQUERY:
# Line 1231  for (;;) Line 1335  for (;;)
1335        break;        break;
1336    
1337        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1338          case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1339          case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1340          case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1341          count = 2;
1342          goto QS4;
1343    
1344          case OP_VSPACE_EXTRA + OP_TYPESTAR:
1345          case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1346          case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1347          count = 0;
1348    
1349          QS4:
1350          ADD_ACTIVE(state_offset + 2, 0);
1351          if (clen > 0)
1352            {
1353            BOOL OK;
1354            switch (c)
1355              {
1356              case 0x000a:
1357              case 0x000b:
1358              case 0x000c:
1359              case 0x000d:
1360              case 0x0085:
1361              case 0x2028:
1362              case 0x2029:
1363              OK = TRUE;
1364              break;
1365    
1366              default:
1367              OK = FALSE;
1368              break;
1369              }
1370            if (OK == (d == OP_VSPACE))
1371              {
1372              if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1373                  codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1374                {
1375                active_count--;           /* Remove non-match possibility */
1376                next_active_state--;
1377                }
1378              ADD_NEW_DATA(-(state_offset + count), 0, 0);
1379              }
1380            }
1381          break;
1382    
1383          /*-----------------------------------------------------------------*/
1384          case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1385          case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1386          case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1387          count = 2;
1388          goto QS5;
1389    
1390          case OP_HSPACE_EXTRA + OP_TYPESTAR:
1391          case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1392          case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1393          count = 0;
1394    
1395          QS5:
1396          ADD_ACTIVE(state_offset + 2, 0);
1397          if (clen > 0)
1398            {
1399            BOOL OK;
1400            switch (c)
1401              {
1402              case 0x09:      /* HT */
1403              case 0x20:      /* SPACE */
1404              case 0xa0:      /* NBSP */
1405              case 0x1680:    /* OGHAM SPACE MARK */
1406              case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1407              case 0x2000:    /* EN QUAD */
1408              case 0x2001:    /* EM QUAD */
1409              case 0x2002:    /* EN SPACE */
1410              case 0x2003:    /* EM SPACE */
1411              case 0x2004:    /* THREE-PER-EM SPACE */
1412              case 0x2005:    /* FOUR-PER-EM SPACE */
1413              case 0x2006:    /* SIX-PER-EM SPACE */
1414              case 0x2007:    /* FIGURE SPACE */
1415              case 0x2008:    /* PUNCTUATION SPACE */
1416              case 0x2009:    /* THIN SPACE */
1417              case 0x200A:    /* HAIR SPACE */
1418              case 0x202f:    /* NARROW NO-BREAK SPACE */
1419              case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1420              case 0x3000:    /* IDEOGRAPHIC SPACE */
1421              OK = TRUE;
1422              break;
1423    
1424              default:
1425              OK = FALSE;
1426              break;
1427              }
1428    
1429            if (OK == (d == OP_HSPACE))
1430              {
1431              if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1432                  codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1433                {
1434                active_count--;           /* Remove non-match possibility */
1435                next_active_state--;
1436                }
1437              ADD_NEW_DATA(-(state_offset + count), 0, 0);
1438              }
1439            }
1440          break;
1441    
1442          /*-----------------------------------------------------------------*/
1443  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
1444        case OP_PROP_EXTRA + OP_TYPEEXACT:        case OP_PROP_EXTRA + OP_TYPEEXACT:
1445        case OP_PROP_EXTRA + OP_TYPEUPTO:        case OP_PROP_EXTRA + OP_TYPEUPTO:
# Line 1359  for (;;) Line 1568  for (;;)
1568          }          }
1569        break;        break;
1570    
1571          /*-----------------------------------------------------------------*/
1572          case OP_VSPACE_EXTRA + OP_TYPEEXACT:
1573          case OP_VSPACE_EXTRA + OP_TYPEUPTO:
1574          case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1575          case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1576          if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1577            { ADD_ACTIVE(state_offset + 4, 0); }
1578          count = current_state->count;  /* Number already matched */
1579          if (clen > 0)
1580            {
1581            BOOL OK;
1582            switch (c)
1583              {
1584              case 0x000a:
1585              case 0x000b:
1586              case 0x000c:
1587              case 0x000d:
1588              case 0x0085:
1589              case 0x2028:
1590              case 0x2029:
1591              OK = TRUE;
1592              break;
1593    
1594              default:
1595              OK = FALSE;
1596              }
1597    
1598            if (OK == (d == OP_VSPACE))
1599              {
1600              if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
1601                {
1602                active_count--;           /* Remove non-match possibility */
1603                next_active_state--;
1604                }
1605              if (++count >= GET2(code, 1))
1606                { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1607              else
1608                { ADD_NEW_DATA(-state_offset, count, 0); }
1609              }
1610            }
1611          break;
1612    
1613          /*-----------------------------------------------------------------*/
1614          case OP_HSPACE_EXTRA + OP_TYPEEXACT:
1615          case OP_HSPACE_EXTRA + OP_TYPEUPTO:
1616          case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
1617          case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
1618          if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
1619            { ADD_ACTIVE(state_offset + 4, 0); }
1620          count = current_state->count;  /* Number already matched */
1621          if (clen > 0)
1622            {
1623            BOOL OK;
1624            switch (c)
1625              {
1626              case 0x09:      /* HT */
1627              case 0x20:      /* SPACE */
1628              case 0xa0:      /* NBSP */
1629              case 0x1680:    /* OGHAM SPACE MARK */
1630              case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1631              case 0x2000:    /* EN QUAD */
1632              case 0x2001:    /* EM QUAD */
1633              case 0x2002:    /* EN SPACE */
1634              case 0x2003:    /* EM SPACE */
1635              case 0x2004:    /* THREE-PER-EM SPACE */
1636              case 0x2005:    /* FOUR-PER-EM SPACE */
1637              case 0x2006:    /* SIX-PER-EM SPACE */
1638              case 0x2007:    /* FIGURE SPACE */
1639              case 0x2008:    /* PUNCTUATION SPACE */
1640              case 0x2009:    /* THIN SPACE */
1641              case 0x200A:    /* HAIR SPACE */
1642              case 0x202f:    /* NARROW NO-BREAK SPACE */
1643              case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1644              case 0x3000:    /* IDEOGRAPHIC SPACE */
1645              OK = TRUE;
1646              break;
1647    
1648              default:
1649              OK = FALSE;
1650              break;
1651              }
1652    
1653            if (OK == (d == OP_HSPACE))
1654              {
1655              if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
1656                {
1657                active_count--;           /* Remove non-match possibility */
1658                next_active_state--;
1659                }
1660              if (++count >= GET2(code, 1))
1661                { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1662              else
1663                { ADD_NEW_DATA(-state_offset, count, 0); }
1664              }
1665            }
1666          break;
1667    
1668  /* ========================================================================== */  /* ========================================================================== */
1669        /* These opcodes are followed by a character that is usually compared        /* These opcodes are followed by a character that is usually compared
1670        to the current subject character; it is loaded into d. We still get        to the current subject character; it is loaded into d. We still get
# Line 1458  for (;;) Line 1764  for (;;)
1764        break;        break;
1765    
1766        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1767          case OP_NOT_VSPACE:
1768          if (clen > 0) switch(c)
1769            {
1770            case 0x000a:
1771            case 0x000b:
1772            case 0x000c:
1773            case 0x000d:
1774            case 0x0085:
1775            case 0x2028:
1776            case 0x2029:
1777            break;
1778    
1779            default:
1780            ADD_NEW(state_offset + 1, 0);
1781            break;
1782            }
1783          break;
1784    
1785          /*-----------------------------------------------------------------*/
1786          case OP_VSPACE:
1787          if (clen > 0) switch(c)
1788            {
1789            case 0x000a:
1790            case 0x000b:
1791            case 0x000c:
1792            case 0x000d:
1793            case 0x0085:
1794            case 0x2028:
1795            case 0x2029:
1796            ADD_NEW(state_offset + 1, 0);
1797            break;
1798    
1799            default: break;
1800            }
1801          break;
1802    
1803          /*-----------------------------------------------------------------*/
1804          case OP_NOT_HSPACE:
1805          if (clen > 0) switch(c)
1806            {
1807            case 0x09:      /* HT */
1808            case 0x20:      /* SPACE */
1809            case 0xa0:      /* NBSP */
1810            case 0x1680:    /* OGHAM SPACE MARK */
1811            case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1812            case 0x2000:    /* EN QUAD */
1813            case 0x2001:    /* EM QUAD */
1814            case 0x2002:    /* EN SPACE */
1815            case 0x2003:    /* EM SPACE */
1816            case 0x2004:    /* THREE-PER-EM SPACE */
1817            case 0x2005:    /* FOUR-PER-EM SPACE */
1818            case 0x2006:    /* SIX-PER-EM SPACE */
1819            case 0x2007:    /* FIGURE SPACE */
1820            case 0x2008:    /* PUNCTUATION SPACE */
1821            case 0x2009:    /* THIN SPACE */
1822            case 0x200A:    /* HAIR SPACE */
1823            case 0x202f:    /* NARROW NO-BREAK SPACE */
1824            case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1825            case 0x3000:    /* IDEOGRAPHIC SPACE */
1826            break;
1827    
1828            default:
1829            ADD_NEW(state_offset + 1, 0);
1830            break;
1831            }
1832          break;
1833    
1834          /*-----------------------------------------------------------------*/
1835          case OP_HSPACE:
1836          if (clen > 0) switch(c)
1837            {
1838            case 0x09:      /* HT */
1839            case 0x20:      /* SPACE */
1840            case 0xa0:      /* NBSP */
1841            case 0x1680:    /* OGHAM SPACE MARK */
1842            case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1843            case 0x2000:    /* EN QUAD */
1844            case 0x2001:    /* EM QUAD */
1845            case 0x2002:    /* EN SPACE */
1846            case 0x2003:    /* EM SPACE */
1847            case 0x2004:    /* THREE-PER-EM SPACE */
1848            case 0x2005:    /* FOUR-PER-EM SPACE */
1849            case 0x2006:    /* SIX-PER-EM SPACE */
1850            case 0x2007:    /* FIGURE SPACE */
1851            case 0x2008:    /* PUNCTUATION SPACE */
1852            case 0x2009:    /* THIN SPACE */
1853            case 0x200A:    /* HAIR SPACE */
1854            case 0x202f:    /* NARROW NO-BREAK SPACE */
1855            case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1856            case 0x3000:    /* IDEOGRAPHIC SPACE */
1857            ADD_NEW(state_offset + 1, 0);
1858            break;
1859            }
1860          break;
1861    
1862          /*-----------------------------------------------------------------*/
1863        /* Match a negated single character. This is only used for one-byte        /* Match a negated single character. This is only used for one-byte
1864        characters, that is, we know that d < 256. The character we are        characters, that is, we know that d < 256. The character we are
1865        checking (c) can be multibyte. */        checking (c) can be multibyte. */

Legend:
Removed from v.151  
changed lines
  Added in v.199

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12