/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Diff of /code/trunk/pcre_dfa_exec.c

Parent Directory | Revision Log | View Patch

revision 96 by nigel, Fri Mar 2 13:10:43 2007 UTC revision 335 by ph10, Sat Apr 12 14:36:14 2008 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2006 University of Cambridge             Copyright (c) 1997-2008 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 44  FSM). This is NOT Perl-compatible, but Line 44  FSM). This is NOT Perl-compatible, but
44  applications. */  applications. */
45    
46    
47    #ifdef HAVE_CONFIG_H
48    #include "config.h"
49    #endif
50    
51  #define NLBLOCK md             /* Block containing newline information */  #define NLBLOCK md             /* Block containing newline information */
52  #define PSSTART start_subject  /* Field containing processed string start */  #define PSSTART start_subject  /* Field containing processed string start */
53  #define PSEND   end_subject    /* Field containing processed string end */  #define PSEND   end_subject    /* Field containing processed string end */
# Line 63  applications. */ Line 67  applications. */
67    
68  /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes  /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
69  into others, under special conditions. A gap of 20 between the blocks should be  into others, under special conditions. A gap of 20 between the blocks should be
70  enough. */  enough. The resulting opcodes don't have to be less than 256 because they are
71    never stored, so we push them well clear of the normal opcodes. */
72    
73  #define OP_PROP_EXTRA 100  #define OP_PROP_EXTRA       300
74  #define OP_EXTUNI_EXTRA 120  #define OP_EXTUNI_EXTRA     320
75  #define OP_ANYNL_EXTRA 140  #define OP_ANYNL_EXTRA      340
76    #define OP_HSPACE_EXTRA     360
77    #define OP_VSPACE_EXTRA     380
78    
79    
80  /* This table identifies those opcodes that are followed immediately by a  /* This table identifies those opcodes that are followed immediately by a
81  character that is to be tested in some way. This makes it possible to  character that is to be tested in some way. This makes it possible to
82  centralize the loading of these characters. In the case of Type * etc, the  centralize the loading of these characters. In the case of Type * etc, the
83  "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a  "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
84  small value. */  small value. ***NOTE*** If the start of this table is modified, the two tables
85    that follow must also be modified. */
86    
87  static uschar coptable[] = {  static const uschar coptable[] = {
88    0,                             /* End                                    */    0,                             /* End                                    */
89    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* \A, \G, \B, \b, \D, \d, \S, \s, \W, \w */    0, 0, 0, 0, 0,                 /* \A, \G, \K, \B, \b                     */
90      0, 0, 0, 0, 0, 0,              /* \D, \d, \S, \s, \W, \w                 */
91    0, 0,                          /* Any, Anybyte                           */    0, 0,                          /* Any, Anybyte                           */
92    0, 0, 0, 0,                    /* NOTPROP, PROP, EXTUNI, ANYNL           */    0, 0, 0,                       /* NOTPROP, PROP, EXTUNI                  */
93      0, 0, 0, 0, 0,                 /* \R, \H, \h, \V, \v                     */
94    0, 0, 0, 0, 0,                 /* \Z, \z, Opt, ^, $                      */    0, 0, 0, 0, 0,                 /* \Z, \z, Opt, ^, $                      */
95    1,                             /* Char                                   */    1,                             /* Char                                   */
96    1,                             /* Charnc                                 */    1,                             /* Charnc                                 */
# Line 120  static uschar coptable[] = { Line 130  static uschar coptable[] = {
130    0,                             /* CREF                                   */    0,                             /* CREF                                   */
131    0,                             /* RREF                                   */    0,                             /* RREF                                   */
132    0,                             /* DEF                                    */    0,                             /* DEF                                    */
133    0, 0                           /* BRAZERO, BRAMINZERO                    */    0, 0,                          /* BRAZERO, BRAMINZERO                    */
134      0, 0, 0, 0,                    /* PRUNE, SKIP, THEN, COMMIT              */
135      0, 0                           /* FAIL, ACCEPT                           */
136  };  };
137    
138  /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,  /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
139  and \w */  and \w */
140    
141  static uschar toptable1[] = {  static const uschar toptable1[] = {
142    0, 0, 0, 0, 0,    0, 0, 0, 0, 0, 0,
143    ctype_digit, ctype_digit,    ctype_digit, ctype_digit,
144    ctype_space, ctype_space,    ctype_space, ctype_space,
145    ctype_word,  ctype_word,    ctype_word,  ctype_word,
146    0                               /* OP_ANY */    0                               /* OP_ANY */
147  };  };
148    
149  static uschar toptable2[] = {  static const uschar toptable2[] = {
150    0, 0, 0, 0, 0,    0, 0, 0, 0, 0, 0,
151    ctype_digit, 0,    ctype_digit, 0,
152    ctype_space, 0,    ctype_space, 0,
153    ctype_word,  0,    ctype_word,  0,
# Line 500  for (;;) Line 512  for (;;)
512      const uschar *code;      const uschar *code;
513      int state_offset = current_state->offset;      int state_offset = current_state->offset;
514      int count, codevalue;      int count, codevalue;
515    #ifdef SUPPORT_UCP
516      int chartype, script;      int chartype, script;
517    #endif
518    
519  #ifdef DEBUG  #ifdef DEBUG
520      printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);      printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
# Line 555  for (;;) Line 569  for (;;)
569      permitted.      permitted.
570    
571      We also use this mechanism for opcodes such as OP_TYPEPLUS that take an      We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
572      argument that is not a data character - but is always one byte long.      argument that is not a data character - but is always one byte long. We
573      Unfortunately, we have to take special action to deal with  \P, \p, and      have to take special action to deal with  \P, \p, \H, \h, \V, \v and \X in
574      \X in this case. To keep the other cases fast, convert these ones to new      this case. To keep the other cases fast, convert these ones to new opcodes.
575      opcodes. */      */
576    
577      if (coptable[codevalue] > 0)      if (coptable[codevalue] > 0)
578        {        {
# Line 576  for (;;) Line 590  for (;;)
590            case OP_PROP: codevalue += OP_PROP_EXTRA; break;            case OP_PROP: codevalue += OP_PROP_EXTRA; break;
591            case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;            case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
592            case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;            case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
593              case OP_NOT_HSPACE:
594              case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
595              case OP_NOT_VSPACE:
596              case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
597            default: break;            default: break;
598            }            }
599          }          }
# Line 676  for (;;) Line 694  for (;;)
694        break;        break;
695    
696        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
697          case OP_SKIPZERO:
698          code += 1 + GET(code, 2);
699          while (*code == OP_ALT) code += GET(code, 1);
700          ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
701          break;
702    
703          /*-----------------------------------------------------------------*/
704        case OP_CIRC:        case OP_CIRC:
705        if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||        if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
706            ((ims & PCRE_MULTILINE) != 0 &&            ((ims & PCRE_MULTILINE) != 0 &&
# Line 783  for (;;) Line 808  for (;;)
808        break;        break;
809    
810    
 #ifdef SUPPORT_UCP  
   
811        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
812        /* Check the next character by Unicode property. We will get here only        /* Check the next character by Unicode property. We will get here only
813        if the support is in the binary; otherwise a compile-time error occurs.        if the support is in the binary; otherwise a compile-time error occurs.
814        */        */
815    
816    #ifdef SUPPORT_UCP
817        case OP_PROP:        case OP_PROP:
818        case OP_NOTPROP:        case OP_NOTPROP:
819        if (clen > 0)        if (clen > 0)
# Line 970  for (;;) Line 994  for (;;)
994        argument. It keeps the code above fast for the other cases. The argument        argument. It keeps the code above fast for the other cases. The argument
995        is in the d variable. */        is in the d variable. */
996    
997    #ifdef SUPPORT_UCP
998        case OP_PROP_EXTRA + OP_TYPEPLUS:        case OP_PROP_EXTRA + OP_TYPEPLUS:
999        case OP_PROP_EXTRA + OP_TYPEMINPLUS:        case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1000        case OP_PROP_EXTRA + OP_TYPEPOSPLUS:        case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
# Line 1049  for (;;) Line 1074  for (;;)
1074          ADD_NEW_DATA(-state_offset, count, ncount);          ADD_NEW_DATA(-state_offset, count, ncount);
1075          }          }
1076        break;        break;
1077    #endif
1078    
1079        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1080        case OP_ANYNL_EXTRA + OP_TYPEPLUS:        case OP_ANYNL_EXTRA + OP_TYPEPLUS:
# Line 1061  for (;;) Line 1087  for (;;)
1087          int ncount = 0;          int ncount = 0;
1088          switch (c)          switch (c)
1089            {            {
1090              case 0x000b:
1091              case 0x000c:
1092              case 0x0085:
1093              case 0x2028:
1094              case 0x2029:
1095              if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1096              goto ANYNL01;
1097    
1098            case 0x000d:            case 0x000d:
1099            if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;            if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1100            /* Fall through */            /* Fall through */
1101    
1102              ANYNL01:
1103              case 0x000a:
1104              if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1105                {
1106                active_count--;           /* Remove non-match possibility */
1107                next_active_state--;
1108                }
1109              count++;
1110              ADD_NEW_DATA(-state_offset, count, ncount);
1111              break;
1112    
1113              default:
1114              break;
1115              }
1116            }
1117          break;
1118    
1119          /*-----------------------------------------------------------------*/
1120          case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1121          case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1122          case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1123          count = current_state->count;  /* Already matched */
1124          if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1125          if (clen > 0)
1126            {
1127            BOOL OK;
1128            switch (c)
1129              {
1130            case 0x000a:            case 0x000a:
1131            case 0x000b:            case 0x000b:
1132            case 0x000c:            case 0x000c:
1133              case 0x000d:
1134            case 0x0085:            case 0x0085:
1135            case 0x2028:            case 0x2028:
1136            case 0x2029:            case 0x2029:
1137            if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)            OK = TRUE;
1138              break;
1139    
1140              default:
1141              OK = FALSE;
1142              break;
1143              }
1144    
1145            if (OK == (d == OP_VSPACE))
1146              {
1147              if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1148              {              {
1149              active_count--;           /* Remove non-match possibility */              active_count--;           /* Remove non-match possibility */
1150              next_active_state--;              next_active_state--;
1151              }              }
1152            count++;            count++;
1153            ADD_NEW_DATA(-state_offset, count, ncount);            ADD_NEW_DATA(-state_offset, count, 0);
1154              }
1155            }
1156          break;
1157    
1158          /*-----------------------------------------------------------------*/
1159          case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1160          case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1161          case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1162          count = current_state->count;  /* Already matched */
1163          if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1164          if (clen > 0)
1165            {
1166            BOOL OK;
1167            switch (c)
1168              {
1169              case 0x09:      /* HT */
1170              case 0x20:      /* SPACE */
1171              case 0xa0:      /* NBSP */
1172              case 0x1680:    /* OGHAM SPACE MARK */
1173              case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1174              case 0x2000:    /* EN QUAD */
1175              case 0x2001:    /* EM QUAD */
1176              case 0x2002:    /* EN SPACE */
1177              case 0x2003:    /* EM SPACE */
1178              case 0x2004:    /* THREE-PER-EM SPACE */
1179              case 0x2005:    /* FOUR-PER-EM SPACE */
1180              case 0x2006:    /* SIX-PER-EM SPACE */
1181              case 0x2007:    /* FIGURE SPACE */
1182              case 0x2008:    /* PUNCTUATION SPACE */
1183              case 0x2009:    /* THIN SPACE */
1184              case 0x200A:    /* HAIR SPACE */
1185              case 0x202f:    /* NARROW NO-BREAK SPACE */
1186              case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1187              case 0x3000:    /* IDEOGRAPHIC SPACE */
1188              OK = TRUE;
1189            break;            break;
1190    
1191            default:            default:
1192              OK = FALSE;
1193            break;            break;
1194            }            }
1195    
1196            if (OK == (d == OP_HSPACE))
1197              {
1198              if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1199                {
1200                active_count--;           /* Remove non-match possibility */
1201                next_active_state--;
1202                }
1203              count++;
1204              ADD_NEW_DATA(-state_offset, count, 0);
1205              }
1206          }          }
1207        break;        break;
1208    
1209        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1210    #ifdef SUPPORT_UCP
1211        case OP_PROP_EXTRA + OP_TYPEQUERY:        case OP_PROP_EXTRA + OP_TYPEQUERY:
1212        case OP_PROP_EXTRA + OP_TYPEMINQUERY:        case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1213        case OP_PROP_EXTRA + OP_TYPEPOSQUERY:        case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
# Line 1182  for (;;) Line 1305  for (;;)
1305          ADD_NEW_DATA(-(state_offset + count), 0, ncount);          ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1306          }          }
1307        break;        break;
1308    #endif
1309    
1310        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1311        case OP_ANYNL_EXTRA + OP_TYPEQUERY:        case OP_ANYNL_EXTRA + OP_TYPEQUERY:
# Line 1202  for (;;) Line 1326  for (;;)
1326          int ncount = 0;          int ncount = 0;
1327          switch (c)          switch (c)
1328            {            {
1329              case 0x000b:
1330              case 0x000c:
1331              case 0x0085:
1332              case 0x2028:
1333              case 0x2029:
1334              if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1335              goto ANYNL02;
1336    
1337            case 0x000d:            case 0x000d:
1338            if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;            if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1339            /* Fall through */            /* Fall through */
1340    
1341              ANYNL02:
1342              case 0x000a:
1343              if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1344                  codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1345                {
1346                active_count--;           /* Remove non-match possibility */
1347                next_active_state--;
1348                }
1349              ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1350              break;
1351    
1352              default:
1353              break;
1354              }
1355            }
1356          break;
1357    
1358          /*-----------------------------------------------------------------*/
1359          case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1360          case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1361          case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1362          count = 2;
1363          goto QS4;
1364    
1365          case OP_VSPACE_EXTRA + OP_TYPESTAR:
1366          case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1367          case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1368          count = 0;
1369    
1370          QS4:
1371          ADD_ACTIVE(state_offset + 2, 0);
1372          if (clen > 0)
1373            {
1374            BOOL OK;
1375            switch (c)
1376              {
1377            case 0x000a:            case 0x000a:
1378            case 0x000b:            case 0x000b:
1379            case 0x000c:            case 0x000c:
1380              case 0x000d:
1381            case 0x0085:            case 0x0085:
1382            case 0x2028:            case 0x2028:
1383            case 0x2029:            case 0x2029:
1384            if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||            OK = TRUE;
1385                codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)            break;
1386    
1387              default:
1388              OK = FALSE;
1389              break;
1390              }
1391            if (OK == (d == OP_VSPACE))
1392              {
1393              if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1394                  codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1395              {              {
1396              active_count--;           /* Remove non-match possibility */              active_count--;           /* Remove non-match possibility */
1397              next_active_state--;              next_active_state--;
1398              }              }
1399            ADD_NEW_DATA(-(state_offset + count), 0, ncount);            ADD_NEW_DATA(-(state_offset + count), 0, 0);
1400              }
1401            }
1402          break;
1403    
1404          /*-----------------------------------------------------------------*/
1405          case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1406          case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1407          case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1408          count = 2;
1409          goto QS5;
1410    
1411          case OP_HSPACE_EXTRA + OP_TYPESTAR:
1412          case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1413          case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1414          count = 0;
1415    
1416          QS5:
1417          ADD_ACTIVE(state_offset + 2, 0);
1418          if (clen > 0)
1419            {
1420            BOOL OK;
1421            switch (c)
1422              {
1423              case 0x09:      /* HT */
1424              case 0x20:      /* SPACE */
1425              case 0xa0:      /* NBSP */
1426              case 0x1680:    /* OGHAM SPACE MARK */
1427              case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1428              case 0x2000:    /* EN QUAD */
1429              case 0x2001:    /* EM QUAD */
1430              case 0x2002:    /* EN SPACE */
1431              case 0x2003:    /* EM SPACE */
1432              case 0x2004:    /* THREE-PER-EM SPACE */
1433              case 0x2005:    /* FOUR-PER-EM SPACE */
1434              case 0x2006:    /* SIX-PER-EM SPACE */
1435              case 0x2007:    /* FIGURE SPACE */
1436              case 0x2008:    /* PUNCTUATION SPACE */
1437              case 0x2009:    /* THIN SPACE */
1438              case 0x200A:    /* HAIR SPACE */
1439              case 0x202f:    /* NARROW NO-BREAK SPACE */
1440              case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1441              case 0x3000:    /* IDEOGRAPHIC SPACE */
1442              OK = TRUE;
1443            break;            break;
1444    
1445            default:            default:
1446              OK = FALSE;
1447            break;            break;
1448            }            }
1449    
1450            if (OK == (d == OP_HSPACE))
1451              {
1452              if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1453                  codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1454                {
1455                active_count--;           /* Remove non-match possibility */
1456                next_active_state--;
1457                }
1458              ADD_NEW_DATA(-(state_offset + count), 0, 0);
1459              }
1460          }          }
1461        break;        break;
1462    
1463        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1464    #ifdef SUPPORT_UCP
1465        case OP_PROP_EXTRA + OP_TYPEEXACT:        case OP_PROP_EXTRA + OP_TYPEEXACT:
1466        case OP_PROP_EXTRA + OP_TYPEUPTO:        case OP_PROP_EXTRA + OP_TYPEUPTO:
1467        case OP_PROP_EXTRA + OP_TYPEMINUPTO:        case OP_PROP_EXTRA + OP_TYPEMINUPTO:
# Line 1313  for (;;) Line 1549  for (;;)
1549            { ADD_NEW_DATA(-state_offset, count, ncount); }            { ADD_NEW_DATA(-state_offset, count, ncount); }
1550          }          }
1551        break;        break;
1552    #endif
1553    
1554        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1555        case OP_ANYNL_EXTRA + OP_TYPEEXACT:        case OP_ANYNL_EXTRA + OP_TYPEEXACT:
# Line 1327  for (;;) Line 1564  for (;;)
1564          int ncount = 0;          int ncount = 0;
1565          switch (c)          switch (c)
1566            {            {
1567              case 0x000b:
1568              case 0x000c:
1569              case 0x0085:
1570              case 0x2028:
1571              case 0x2029:
1572              if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1573              goto ANYNL03;
1574    
1575            case 0x000d:            case 0x000d:
1576            if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;            if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1577            /* Fall through */            /* Fall through */
1578    
1579              ANYNL03:
1580              case 0x000a:
1581              if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1582                {
1583                active_count--;           /* Remove non-match possibility */
1584                next_active_state--;
1585                }
1586              if (++count >= GET2(code, 1))
1587                { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1588              else
1589                { ADD_NEW_DATA(-state_offset, count, ncount); }
1590              break;
1591    
1592              default:
1593              break;
1594              }
1595            }
1596          break;
1597    
1598          /*-----------------------------------------------------------------*/
1599          case OP_VSPACE_EXTRA + OP_TYPEEXACT:
1600          case OP_VSPACE_EXTRA + OP_TYPEUPTO:
1601          case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1602          case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1603          if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1604            { ADD_ACTIVE(state_offset + 4, 0); }
1605          count = current_state->count;  /* Number already matched */
1606          if (clen > 0)
1607            {
1608            BOOL OK;
1609            switch (c)
1610              {
1611            case 0x000a:            case 0x000a:
1612            case 0x000b:            case 0x000b:
1613            case 0x000c:            case 0x000c:
1614              case 0x000d:
1615            case 0x0085:            case 0x0085:
1616            case 0x2028:            case 0x2028:
1617            case 0x2029:            case 0x2029:
1618            if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)            OK = TRUE;
1619              break;
1620    
1621              default:
1622              OK = FALSE;
1623              }
1624    
1625            if (OK == (d == OP_VSPACE))
1626              {
1627              if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
1628              {              {
1629              active_count--;           /* Remove non-match possibility */              active_count--;           /* Remove non-match possibility */
1630              next_active_state--;              next_active_state--;
1631              }              }
1632            if (++count >= GET2(code, 1))            if (++count >= GET2(code, 1))
1633              { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }              { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1634            else            else
1635              { ADD_NEW_DATA(-state_offset, count, ncount); }              { ADD_NEW_DATA(-state_offset, count, 0); }
1636              }
1637            }
1638          break;
1639    
1640          /*-----------------------------------------------------------------*/
1641          case OP_HSPACE_EXTRA + OP_TYPEEXACT:
1642          case OP_HSPACE_EXTRA + OP_TYPEUPTO:
1643          case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
1644          case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
1645          if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
1646            { ADD_ACTIVE(state_offset + 4, 0); }
1647          count = current_state->count;  /* Number already matched */
1648          if (clen > 0)
1649            {
1650            BOOL OK;
1651            switch (c)
1652              {
1653              case 0x09:      /* HT */
1654              case 0x20:      /* SPACE */
1655              case 0xa0:      /* NBSP */
1656              case 0x1680:    /* OGHAM SPACE MARK */
1657              case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1658              case 0x2000:    /* EN QUAD */
1659              case 0x2001:    /* EM QUAD */
1660              case 0x2002:    /* EN SPACE */
1661              case 0x2003:    /* EM SPACE */
1662              case 0x2004:    /* THREE-PER-EM SPACE */
1663              case 0x2005:    /* FOUR-PER-EM SPACE */
1664              case 0x2006:    /* SIX-PER-EM SPACE */
1665              case 0x2007:    /* FIGURE SPACE */
1666              case 0x2008:    /* PUNCTUATION SPACE */
1667              case 0x2009:    /* THIN SPACE */
1668              case 0x200A:    /* HAIR SPACE */
1669              case 0x202f:    /* NARROW NO-BREAK SPACE */
1670              case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1671              case 0x3000:    /* IDEOGRAPHIC SPACE */
1672              OK = TRUE;
1673            break;            break;
1674    
1675            default:            default:
1676              OK = FALSE;
1677            break;            break;
1678            }            }
1679    
1680            if (OK == (d == OP_HSPACE))
1681              {
1682              if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
1683                {
1684                active_count--;           /* Remove non-match possibility */
1685                next_active_state--;
1686                }
1687              if (++count >= GET2(code, 1))
1688                { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1689              else
1690                { ADD_NEW_DATA(-state_offset, count, 0); }
1691              }
1692          }          }
1693        break;        break;
1694    
# Line 1429  for (;;) Line 1769  for (;;)
1769        case OP_ANYNL:        case OP_ANYNL:
1770        if (clen > 0) switch(c)        if (clen > 0) switch(c)
1771          {          {
         case 0x000a:  
1772          case 0x000b:          case 0x000b:
1773          case 0x000c:          case 0x000c:
1774          case 0x0085:          case 0x0085:
1775          case 0x2028:          case 0x2028:
1776          case 0x2029:          case 0x2029:
1777            if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1778    
1779            case 0x000a:
1780          ADD_NEW(state_offset + 1, 0);          ADD_NEW(state_offset + 1, 0);
1781          break;          break;
1782    
1783          case 0x000d:          case 0x000d:
1784          if (ptr + 1 < end_subject && ptr[1] == 0x0a)          if (ptr + 1 < end_subject && ptr[1] == 0x0a)
1785            {            {
# Line 1451  for (;;) Line 1794  for (;;)
1794        break;        break;
1795    
1796        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1797          case OP_NOT_VSPACE:
1798          if (clen > 0) switch(c)
1799            {
1800            case 0x000a:
1801            case 0x000b:
1802            case 0x000c:
1803            case 0x000d:
1804            case 0x0085:
1805            case 0x2028:
1806            case 0x2029:
1807            break;
1808    
1809            default:
1810            ADD_NEW(state_offset + 1, 0);
1811            break;
1812            }
1813          break;
1814    
1815          /*-----------------------------------------------------------------*/
1816          case OP_VSPACE:
1817          if (clen > 0) switch(c)
1818            {
1819            case 0x000a:
1820            case 0x000b:
1821            case 0x000c:
1822            case 0x000d:
1823            case 0x0085:
1824            case 0x2028:
1825            case 0x2029:
1826            ADD_NEW(state_offset + 1, 0);
1827            break;
1828    
1829            default: break;
1830            }
1831          break;
1832    
1833          /*-----------------------------------------------------------------*/
1834          case OP_NOT_HSPACE:
1835          if (clen > 0) switch(c)
1836            {
1837            case 0x09:      /* HT */
1838            case 0x20:      /* SPACE */
1839            case 0xa0:      /* NBSP */
1840            case 0x1680:    /* OGHAM SPACE MARK */
1841            case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1842            case 0x2000:    /* EN QUAD */
1843            case 0x2001:    /* EM QUAD */
1844            case 0x2002:    /* EN SPACE */
1845            case 0x2003:    /* EM SPACE */
1846            case 0x2004:    /* THREE-PER-EM SPACE */
1847            case 0x2005:    /* FOUR-PER-EM SPACE */
1848            case 0x2006:    /* SIX-PER-EM SPACE */
1849            case 0x2007:    /* FIGURE SPACE */
1850            case 0x2008:    /* PUNCTUATION SPACE */
1851            case 0x2009:    /* THIN SPACE */
1852            case 0x200A:    /* HAIR SPACE */
1853            case 0x202f:    /* NARROW NO-BREAK SPACE */
1854            case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1855            case 0x3000:    /* IDEOGRAPHIC SPACE */
1856            break;
1857    
1858            default:
1859            ADD_NEW(state_offset + 1, 0);
1860            break;
1861            }
1862          break;
1863    
1864          /*-----------------------------------------------------------------*/
1865          case OP_HSPACE:
1866          if (clen > 0) switch(c)
1867            {
1868            case 0x09:      /* HT */
1869            case 0x20:      /* SPACE */
1870            case 0xa0:      /* NBSP */
1871            case 0x1680:    /* OGHAM SPACE MARK */
1872            case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1873            case 0x2000:    /* EN QUAD */
1874            case 0x2001:    /* EM QUAD */
1875            case 0x2002:    /* EN SPACE */
1876            case 0x2003:    /* EM SPACE */
1877            case 0x2004:    /* THREE-PER-EM SPACE */
1878            case 0x2005:    /* FOUR-PER-EM SPACE */
1879            case 0x2006:    /* SIX-PER-EM SPACE */
1880            case 0x2007:    /* FIGURE SPACE */
1881            case 0x2008:    /* PUNCTUATION SPACE */
1882            case 0x2009:    /* THIN SPACE */
1883            case 0x200A:    /* HAIR SPACE */
1884            case 0x202f:    /* NARROW NO-BREAK SPACE */
1885            case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1886            case 0x3000:    /* IDEOGRAPHIC SPACE */
1887            ADD_NEW(state_offset + 1, 0);
1888            break;
1889            }
1890          break;
1891    
1892          /*-----------------------------------------------------------------*/
1893        /* Match a negated single character. This is only used for one-byte        /* Match a negated single character. This is only used for one-byte
1894        characters, that is, we know that d < 256. The character we are        characters, that is, we know that d < 256. The character we are
1895        checking (c) can be multibyte. */        checking (c) can be multibyte. */
# Line 2057  is not anchored. Line 2496  is not anchored.
2496    
2497  Arguments:  Arguments:
2498    argument_re     points to the compiled expression    argument_re     points to the compiled expression
2499    extra_data      points to extra data or is NULL (not currently used)    extra_data      points to extra data or is NULL
2500    subject         points to the subject string    subject         points to the subject string
2501    length          length of subject string (may contain binary zeros)    length          length of subject string (may contain binary zeros)
2502    start_offset    where to start in the subject string    start_offset    where to start in the subject string
# Line 2073  Returns: > 0 => number of match Line 2512  Returns: > 0 => number of match
2512                   < -1 => some kind of unexpected problem                   < -1 => some kind of unexpected problem
2513  */  */
2514    
2515  PCRE_DATA_SCOPE int  PCRE_EXP_DEFN int
2516  pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,  pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
2517    const char *subject, int length, int start_offset, int options, int *offsets,    const char *subject, int length, int start_offset, int options, int *offsets,
2518    int offsetcount, int *workspace, int wscount)    int offsetcount, int *workspace, int wscount)
# Line 2163  md->end_subject = end_subject; Line 2602  md->end_subject = end_subject;
2602  md->moptions = options;  md->moptions = options;
2603  md->poptions = re->options;  md->poptions = re->options;
2604    
2605  /* Handle different types of newline. The two bits give four cases. If nothing  /* If the BSR option is not set at match time, copy what was set
2606  is set at run time, whatever was used at compile time applies. */  at compile time. */
2607    
2608  switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : options) &  if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
2609      {
2610      if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
2611        md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
2612    #ifdef BSR_ANYCRLF
2613      else md->moptions |= PCRE_BSR_ANYCRLF;
2614    #endif
2615      }
2616    
2617    /* Handle different types of newline. The three bits give eight cases. If
2618    nothing is set at run time, whatever was used at compile time applies. */
2619    
2620    switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
2621           PCRE_NEWLINE_BITS)           PCRE_NEWLINE_BITS)
2622    {    {
2623    case 0: newline = NEWLINE; break;   /* Compile-time default */    case 0: newline = NEWLINE; break;   /* Compile-time default */
# Line 2175  switch ((((options & PCRE_NEWLINE_BITS) Line 2626  switch ((((options & PCRE_NEWLINE_BITS)
2626    case PCRE_NEWLINE_CR+    case PCRE_NEWLINE_CR+
2627         PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;         PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
2628    case PCRE_NEWLINE_ANY: newline = -1; break;    case PCRE_NEWLINE_ANY: newline = -1; break;
2629      case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
2630    default: return PCRE_ERROR_BADNEWLINE;    default: return PCRE_ERROR_BADNEWLINE;
2631    }    }
2632    
2633  if (newline < 0)  if (newline == -2)
2634      {
2635      md->nltype = NLTYPE_ANYCRLF;
2636      }
2637    else if (newline < 0)
2638    {    {
2639    md->nltype = NLTYPE_ANY;    md->nltype = NLTYPE_ANY;
2640    }    }
# Line 2228  if (md->tables == NULL) md->tables = _pc Line 2684  if (md->tables == NULL) md->tables = _pc
2684  used in a loop when finding where to start. */  used in a loop when finding where to start. */
2685    
2686  lcc = md->tables + lcc_offset;  lcc = md->tables + lcc_offset;
2687  startline = (re->options & PCRE_STARTLINE) != 0;  startline = (re->flags & PCRE_STARTLINE) != 0;
2688  firstline = (re->options & PCRE_FIRSTLINE) != 0;  firstline = (re->options & PCRE_FIRSTLINE) != 0;
2689    
2690  /* Set up the first character to match, if available. The first_byte value is  /* Set up the first character to match, if available. The first_byte value is
# Line 2239  studied, there may be a bitmap of possib Line 2695  studied, there may be a bitmap of possib
2695    
2696  if (!anchored)  if (!anchored)
2697    {    {
2698    if ((re->options & PCRE_FIRSTSET) != 0)    if ((re->flags & PCRE_FIRSTSET) != 0)
2699      {      {
2700      first_byte = re->first_byte & 255;      first_byte = re->first_byte & 255;
2701      if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)      if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
# Line 2256  if (!anchored) Line 2712  if (!anchored)
2712  /* For anchored or unanchored matches, there may be a "last known required  /* For anchored or unanchored matches, there may be a "last known required
2713  character" set. */  character" set. */
2714    
2715  if ((re->options & PCRE_REQCHSET) != 0)  if ((re->flags & PCRE_REQCHSET) != 0)
2716    {    {
2717    req_byte = re->req_byte & 255;    req_byte = re->req_byte & 255;
2718    req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;    req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
# Line 2308  for (;;) Line 2764  for (;;)
2764          {          {
2765          while (current_subject <= end_subject && !WAS_NEWLINE(current_subject))          while (current_subject <= end_subject && !WAS_NEWLINE(current_subject))
2766            current_subject++;            current_subject++;
2767    
2768            /* If we have just passed a CR and the newline option is ANY or
2769            ANYCRLF, and we are now at a LF, advance the match position by one more
2770            character. */
2771    
2772            if (current_subject[-1] == '\r' &&
2773                 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
2774                 current_subject < end_subject &&
2775                 *current_subject == '\n')
2776              current_subject++;
2777          }          }
2778        }        }
2779    
# Line 2416  for (;;) Line 2882  for (;;)
2882      }      }
2883    if (current_subject > end_subject) break;    if (current_subject > end_subject) break;
2884    
2885    /* If we have just passed a CR and the newline option is CRLF or ANY, and we    /* If we have just passed a CR and we are now at a LF, and the pattern does
2886    are now at a LF, advance the match position by one more character. */    not contain any explicit matches for \r or \n, and the newline option is CRLF
2887      or ANY or ANYCRLF, advance the match position by one more character. */
2888    
2889    if (current_subject[-1] == '\r' &&    if (current_subject[-1] == '\r' &&
2890         (md->nltype == NLTYPE_ANY || md->nllen == 2) &&        current_subject < end_subject &&
2891         current_subject < end_subject &&        *current_subject == '\n' &&
2892         *current_subject == '\n')        (re->flags & PCRE_HASCRORLF) == 0 &&
2893            (md->nltype == NLTYPE_ANY ||
2894             md->nltype == NLTYPE_ANYCRLF ||
2895             md->nllen == 2))
2896      current_subject++;      current_subject++;
2897    
2898    }   /* "Bumpalong" loop */    }   /* "Bumpalong" loop */

Legend:
Removed from v.96  
changed lines
  Added in v.335

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12