/[pcre]/code/branches/pcre16/pcre_dfa_exec.c
ViewVC logotype

Diff of /code/branches/pcre16/pcre_dfa_exec.c

Parent Directory | Revision Log | View Patch

revision 168 by ph10, Tue May 29 15:18:18 2007 UTC revision 406 by ph10, Mon Mar 23 12:05:43 2009 UTC
# Line 3  Line 3 
3  *************************************************/  *************************************************/
4    
5  /* PCRE is a library of functions to support regular expressions whose syntax  /* PCRE is a library of functions to support regular expressions whose syntax
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language (but see
7    below for why this module is different).
8    
9                         Written by Philip Hazel                         Written by Philip Hazel
10             Copyright (c) 1997-2007 University of Cambridge             Copyright (c) 1997-2009 University of Cambridge
11    
12  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
13  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 44  FSM). This is NOT Perl-compatible, but Line 45  FSM). This is NOT Perl-compatible, but
45  applications. */  applications. */
46    
47    
48    #ifdef HAVE_CONFIG_H
49    #include "config.h"
50    #endif
51    
52  #define NLBLOCK md             /* Block containing newline information */  #define NLBLOCK md             /* Block containing newline information */
53  #define PSSTART start_subject  /* Field containing processed string start */  #define PSSTART start_subject  /* Field containing processed string start */
54  #define PSEND   end_subject    /* Field containing processed string end */  #define PSEND   end_subject    /* Field containing processed string end */
# Line 56  applications. */ Line 61  applications. */
61  #define SP "                   "  #define SP "                   "
62    
63    
   
64  /*************************************************  /*************************************************
65  *      Code parameters and static tables         *  *      Code parameters and static tables         *
66  *************************************************/  *************************************************/
67    
68  /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes  /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
69  into others, under special conditions. A gap of 20 between the blocks should be  into others, under special conditions. A gap of 20 between the blocks should be
70  enough. */  enough. The resulting opcodes don't have to be less than 256 because they are
71    never stored, so we push them well clear of the normal opcodes. */
72    
73  #define OP_PROP_EXTRA 100  #define OP_PROP_EXTRA       300
74  #define OP_EXTUNI_EXTRA 120  #define OP_EXTUNI_EXTRA     320
75  #define OP_ANYNL_EXTRA 140  #define OP_ANYNL_EXTRA      340
76    #define OP_HSPACE_EXTRA     360
77    #define OP_VSPACE_EXTRA     380
78    
79    
80  /* This table identifies those opcodes that are followed immediately by a  /* This table identifies those opcodes that are followed immediately by a
81  character that is to be tested in some way. This makes it possible to  character that is to be tested in some way. This makes it possible to
82  centralize the loading of these characters. In the case of Type * etc, the  centralize the loading of these characters. In the case of Type * etc, the
83  "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a  "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
84  small value. ***NOTE*** If the start of this table is modified, the two tables  small value. ***NOTE*** If the start of this table is modified, the two tables
85  that follow must also be modified. */  that follow must also be modified. */
86    
87  static uschar coptable[] = {  static const uschar coptable[] = {
88    0,                             /* End                                    */    0,                             /* End                                    */
89    0, 0, 0, 0, 0,                 /* \A, \G, \K, \B, \b                     */    0, 0, 0, 0, 0,                 /* \A, \G, \K, \B, \b                     */
90    0, 0, 0, 0, 0, 0,              /* \D, \d, \S, \s, \W, \w                 */    0, 0, 0, 0, 0, 0,              /* \D, \d, \S, \s, \W, \w                 */
91    0, 0,                          /* Any, Anybyte                           */    0, 0, 0,                       /* Any, AllAny, Anybyte                   */
92    0, 0, 0, 0,                    /* NOTPROP, PROP, EXTUNI, ANYNL           */    0, 0, 0,                       /* NOTPROP, PROP, EXTUNI                  */
93      0, 0, 0, 0, 0,                 /* \R, \H, \h, \V, \v                     */
94    0, 0, 0, 0, 0,                 /* \Z, \z, Opt, ^, $                      */    0, 0, 0, 0, 0,                 /* \Z, \z, Opt, ^, $                      */
95    1,                             /* Char                                   */    1,                             /* Char                                   */
96    1,                             /* Charnc                                 */    1,                             /* Charnc                                 */
# Line 122  static uschar coptable[] = { Line 130  static uschar coptable[] = {
130    0,                             /* CREF                                   */    0,                             /* CREF                                   */
131    0,                             /* RREF                                   */    0,                             /* RREF                                   */
132    0,                             /* DEF                                    */    0,                             /* DEF                                    */
133    0, 0                           /* BRAZERO, BRAMINZERO                    */    0, 0,                          /* BRAZERO, BRAMINZERO                    */
134      0, 0, 0, 0,                    /* PRUNE, SKIP, THEN, COMMIT              */
135      0, 0, 0                        /* FAIL, ACCEPT, SKIPZERO                 */
136  };  };
137    
138  /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,  /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
139  and \w */  and \w */
140    
141  static uschar toptable1[] = {  static const uschar toptable1[] = {
142    0, 0, 0, 0, 0, 0,    0, 0, 0, 0, 0, 0,
143    ctype_digit, ctype_digit,    ctype_digit, ctype_digit,
144    ctype_space, ctype_space,    ctype_space, ctype_space,
145    ctype_word,  ctype_word,    ctype_word,  ctype_word,
146    0                               /* OP_ANY */    0, 0                            /* OP_ANY, OP_ALLANY */
147  };  };
148    
149  static uschar toptable2[] = {  static const uschar toptable2[] = {
150    0, 0, 0, 0, 0, 0,    0, 0, 0, 0, 0, 0,
151    ctype_digit, 0,    ctype_digit, 0,
152    ctype_space, 0,    ctype_space, 0,
153    ctype_word,  0,    ctype_word,  0,
154    1                               /* OP_ANY */    1, 1                            /* OP_ANY, OP_ALLANY */
155  };  };
156    
157    
# Line 213  Arguments: Line 223  Arguments:
223    rlevel            function call recursion level    rlevel            function call recursion level
224    recursing         regex recursive call level    recursing         regex recursive call level
225    
226  Returns:            > 0 =>  Returns:            > 0 => number of match offset pairs placed in offsets
227                      = 0 =>                      = 0 => offsets overflowed; longest matches are present
228                       -1 => failed to match                       -1 => failed to match
229                     < -1 => some kind of unexpected problem                     < -1 => some kind of unexpected problem
230    
# Line 501  for (;;) Line 511  for (;;)
511      stateblock *current_state = active_states + i;      stateblock *current_state = active_states + i;
512      const uschar *code;      const uschar *code;
513      int state_offset = current_state->offset;      int state_offset = current_state->offset;
514      int count, codevalue;      int count, codevalue, rrc;
 #ifdef SUPPORT_UCP  
     int chartype, script;  
 #endif  
515    
516  #ifdef DEBUG  #ifdef DEBUG
517      printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);      printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
# Line 559  for (;;) Line 566  for (;;)
566      permitted.      permitted.
567    
568      We also use this mechanism for opcodes such as OP_TYPEPLUS that take an      We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
569      argument that is not a data character - but is always one byte long.      argument that is not a data character - but is always one byte long. We
570      Unfortunately, we have to take special action to deal with  \P, \p, and      have to take special action to deal with  \P, \p, \H, \h, \V, \v and \X in
571      \X in this case. To keep the other cases fast, convert these ones to new      this case. To keep the other cases fast, convert these ones to new opcodes.
572      opcodes. */      */
573    
574      if (coptable[codevalue] > 0)      if (coptable[codevalue] > 0)
575        {        {
# Line 580  for (;;) Line 587  for (;;)
587            case OP_PROP: codevalue += OP_PROP_EXTRA; break;            case OP_PROP: codevalue += OP_PROP_EXTRA; break;
588            case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;            case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
589            case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;            case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
590              case OP_NOT_HSPACE:
591              case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
592              case OP_NOT_VSPACE:
593              case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
594            default: break;            default: break;
595            }            }
596          }          }
# Line 680  for (;;) Line 691  for (;;)
691        break;        break;
692    
693        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
694          case OP_SKIPZERO:
695          code += 1 + GET(code, 2);
696          while (*code == OP_ALT) code += GET(code, 1);
697          ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
698          break;
699    
700          /*-----------------------------------------------------------------*/
701        case OP_CIRC:        case OP_CIRC:
702        if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||        if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
703            ((ims & PCRE_MULTILINE) != 0 &&            ((ims & PCRE_MULTILINE) != 0 &&
# Line 718  for (;;) Line 736  for (;;)
736    
737        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
738        case OP_ANY:        case OP_ANY:
739        if (clen > 0 && ((ims & PCRE_DOTALL) != 0 || !IS_NEWLINE(ptr)))        if (clen > 0 && !IS_NEWLINE(ptr))
740            { ADD_NEW(state_offset + 1, 0); }
741          break;
742    
743          /*-----------------------------------------------------------------*/
744          case OP_ALLANY:
745          if (clen > 0)
746          { ADD_NEW(state_offset + 1, 0); }          { ADD_NEW(state_offset + 1, 0); }
747        break;        break;
748    
# Line 733  for (;;) Line 757  for (;;)
757        if ((md->moptions & PCRE_NOTEOL) == 0)        if ((md->moptions & PCRE_NOTEOL) == 0)
758          {          {
759          if (clen == 0 ||          if (clen == 0 ||
760              (IS_NEWLINE(ptr) &&              ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
761                 ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)                 ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
762              ))              ))
763            { ADD_ACTIVE(state_offset + 1, 0); }            { ADD_ACTIVE(state_offset + 1, 0); }
# Line 798  for (;;) Line 822  for (;;)
822        if (clen > 0)        if (clen > 0)
823          {          {
824          BOOL OK;          BOOL OK;
825          int category = _pcre_ucp_findprop(c, &chartype, &script);          const ucd_record * prop = GET_UCD(c);
826          switch(code[1])          switch(code[1])
827            {            {
828            case PT_ANY:            case PT_ANY:
# Line 806  for (;;) Line 830  for (;;)
830            break;            break;
831    
832            case PT_LAMP:            case PT_LAMP:
833            OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
834            break;            break;
835    
836            case PT_GC:            case PT_GC:
837            OK = category == code[2];            OK = _pcre_ucp_gentype[prop->chartype] == code[2];
838            break;            break;
839    
840            case PT_PC:            case PT_PC:
841            OK = chartype == code[2];            OK = prop->chartype == code[2];
842            break;            break;
843    
844            case PT_SC:            case PT_SC:
845            OK = script == code[2];            OK = prop->script == code[2];
846            break;            break;
847    
848            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
# Line 838  for (;;) Line 862  for (;;)
862  /* ========================================================================== */  /* ========================================================================== */
863        /* These opcodes likewise inspect the subject character, but have an        /* These opcodes likewise inspect the subject character, but have an
864        argument that is not a data character. It is one of these opcodes:        argument that is not a data character. It is one of these opcodes:
865        OP_ANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE, OP_WORDCHAR,        OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE,
866        OP_NOT_WORDCHAR. The value is loaded into d. */        OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
867    
868        case OP_TYPEPLUS:        case OP_TYPEPLUS:
869        case OP_TYPEMINPLUS:        case OP_TYPEMINPLUS:
# Line 850  for (;;) Line 874  for (;;)
874          {          {
875          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
876              (c < 256 &&              (c < 256 &&
877                (d != OP_ANY ||                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                (ims & PCRE_DOTALL) != 0 ||  
                !IS_NEWLINE(ptr)  
               ) &&  
878                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
879            {            {
880            if (count > 0 && codevalue == OP_TYPEPOSPLUS)            if (count > 0 && codevalue == OP_TYPEPOSPLUS)
# Line 876  for (;;) Line 897  for (;;)
897          {          {
898          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
899              (c < 256 &&              (c < 256 &&
900                (d != OP_ANY ||                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                (ims & PCRE_DOTALL) != 0 ||  
                !IS_NEWLINE(ptr)  
               ) &&  
901                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
902            {            {
903            if (codevalue == OP_TYPEPOSQUERY)            if (codevalue == OP_TYPEPOSQUERY)
# Line 901  for (;;) Line 919  for (;;)
919          {          {
920          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
921              (c < 256 &&              (c < 256 &&
922                (d != OP_ANY ||                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                (ims & PCRE_DOTALL) != 0 ||  
                !IS_NEWLINE(ptr)  
               ) &&  
923                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
924            {            {
925            if (codevalue == OP_TYPEPOSSTAR)            if (codevalue == OP_TYPEPOSSTAR)
# Line 924  for (;;) Line 939  for (;;)
939          {          {
940          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
941              (c < 256 &&              (c < 256 &&
942                (d != OP_ANY ||                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                (ims & PCRE_DOTALL) != 0 ||  
                !IS_NEWLINE(ptr)  
               ) &&  
943                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
944            {            {
945            if (++count >= GET2(code, 1))            if (++count >= GET2(code, 1))
# Line 948  for (;;) Line 960  for (;;)
960          {          {
961          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
962              (c < 256 &&              (c < 256 &&
963                (d != OP_ANY ||                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                (ims & PCRE_DOTALL) != 0 ||  
                !IS_NEWLINE(ptr)  
               ) &&  
964                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
965            {            {
966            if (codevalue == OP_TYPEPOSUPTO)            if (codevalue == OP_TYPEPOSUPTO)
# Line 982  for (;;) Line 991  for (;;)
991        if (clen > 0)        if (clen > 0)
992          {          {
993          BOOL OK;          BOOL OK;
994          int category = _pcre_ucp_findprop(c, &chartype, &script);          const ucd_record * prop = GET_UCD(c);
995          switch(code[2])          switch(code[2])
996            {            {
997            case PT_ANY:            case PT_ANY:
# Line 990  for (;;) Line 999  for (;;)
999            break;            break;
1000    
1001            case PT_LAMP:            case PT_LAMP:
1002            OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1003            break;            break;
1004    
1005            case PT_GC:            case PT_GC:
1006            OK = category == code[3];            OK = _pcre_ucp_gentype[prop->chartype] == code[3];
1007            break;            break;
1008    
1009            case PT_PC:            case PT_PC:
1010            OK = chartype == code[3];            OK = prop->chartype == code[3];
1011            break;            break;
1012    
1013            case PT_SC:            case PT_SC:
1014            OK = script == code[3];            OK = prop->script == code[3];
1015            break;            break;
1016    
1017            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
# Line 1031  for (;;) Line 1040  for (;;)
1040        case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:        case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1041        count = current_state->count;  /* Already matched */        count = current_state->count;  /* Already matched */
1042        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1043        if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1044          {          {
1045          const uschar *nptr = ptr + clen;          const uschar *nptr = ptr + clen;
1046          int ncount = 0;          int ncount = 0;
# Line 1045  for (;;) Line 1054  for (;;)
1054            int nd;            int nd;
1055            int ndlen = 1;            int ndlen = 1;
1056            GETCHARLEN(nd, nptr, ndlen);            GETCHARLEN(nd, nptr, ndlen);
1057            if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;            if (UCD_CATEGORY(nd) != ucp_M) break;
1058            ncount++;            ncount++;
1059            nptr += ndlen;            nptr += ndlen;
1060            }            }
# Line 1066  for (;;) Line 1075  for (;;)
1075          int ncount = 0;          int ncount = 0;
1076          switch (c)          switch (c)
1077            {            {
1078              case 0x000b:
1079              case 0x000c:
1080              case 0x0085:
1081              case 0x2028:
1082              case 0x2029:
1083              if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1084              goto ANYNL01;
1085    
1086            case 0x000d:            case 0x000d:
1087            if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;            if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1088            /* Fall through */            /* Fall through */
1089    
1090              ANYNL01:
1091              case 0x000a:
1092              if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1093                {
1094                active_count--;           /* Remove non-match possibility */
1095                next_active_state--;
1096                }
1097              count++;
1098              ADD_NEW_DATA(-state_offset, count, ncount);
1099              break;
1100    
1101              default:
1102              break;
1103              }
1104            }
1105          break;
1106    
1107          /*-----------------------------------------------------------------*/
1108          case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1109          case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1110          case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1111          count = current_state->count;  /* Already matched */
1112          if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1113          if (clen > 0)
1114            {
1115            BOOL OK;
1116            switch (c)
1117              {
1118            case 0x000a:            case 0x000a:
1119            case 0x000b:            case 0x000b:
1120            case 0x000c:            case 0x000c:
1121              case 0x000d:
1122            case 0x0085:            case 0x0085:
1123            case 0x2028:            case 0x2028:
1124            case 0x2029:            case 0x2029:
1125            if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)            OK = TRUE;
1126              break;
1127    
1128              default:
1129              OK = FALSE;
1130              break;
1131              }
1132    
1133            if (OK == (d == OP_VSPACE))
1134              {
1135              if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1136              {              {
1137              active_count--;           /* Remove non-match possibility */              active_count--;           /* Remove non-match possibility */
1138              next_active_state--;              next_active_state--;
1139              }              }
1140            count++;            count++;
1141            ADD_NEW_DATA(-state_offset, count, ncount);            ADD_NEW_DATA(-state_offset, count, 0);
1142              }
1143            }
1144          break;
1145    
1146          /*-----------------------------------------------------------------*/
1147          case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1148          case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1149          case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1150          count = current_state->count;  /* Already matched */
1151          if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1152          if (clen > 0)
1153            {
1154            BOOL OK;
1155            switch (c)
1156              {
1157              case 0x09:      /* HT */
1158              case 0x20:      /* SPACE */
1159              case 0xa0:      /* NBSP */
1160              case 0x1680:    /* OGHAM SPACE MARK */
1161              case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1162              case 0x2000:    /* EN QUAD */
1163              case 0x2001:    /* EM QUAD */
1164              case 0x2002:    /* EN SPACE */
1165              case 0x2003:    /* EM SPACE */
1166              case 0x2004:    /* THREE-PER-EM SPACE */
1167              case 0x2005:    /* FOUR-PER-EM SPACE */
1168              case 0x2006:    /* SIX-PER-EM SPACE */
1169              case 0x2007:    /* FIGURE SPACE */
1170              case 0x2008:    /* PUNCTUATION SPACE */
1171              case 0x2009:    /* THIN SPACE */
1172              case 0x200A:    /* HAIR SPACE */
1173              case 0x202f:    /* NARROW NO-BREAK SPACE */
1174              case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1175              case 0x3000:    /* IDEOGRAPHIC SPACE */
1176              OK = TRUE;
1177            break;            break;
1178    
1179            default:            default:
1180              OK = FALSE;
1181            break;            break;
1182            }            }
1183    
1184            if (OK == (d == OP_HSPACE))
1185              {
1186              if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1187                {
1188                active_count--;           /* Remove non-match possibility */
1189                next_active_state--;
1190                }
1191              count++;
1192              ADD_NEW_DATA(-state_offset, count, 0);
1193              }
1194          }          }
1195        break;        break;
1196    
# Line 1108  for (;;) Line 1213  for (;;)
1213        if (clen > 0)        if (clen > 0)
1214          {          {
1215          BOOL OK;          BOOL OK;
1216          int category = _pcre_ucp_findprop(c, &chartype, &script);          const ucd_record * prop = GET_UCD(c);
1217          switch(code[2])          switch(code[2])
1218            {            {
1219            case PT_ANY:            case PT_ANY:
# Line 1116  for (;;) Line 1221  for (;;)
1221            break;            break;
1222    
1223            case PT_LAMP:            case PT_LAMP:
1224            OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1225            break;            break;
1226    
1227            case PT_GC:            case PT_GC:
1228            OK = category == code[3];            OK = _pcre_ucp_gentype[prop->chartype] == code[3];
1229            break;            break;
1230    
1231            case PT_PC:            case PT_PC:
1232            OK = chartype == code[3];            OK = prop->chartype == code[3];
1233            break;            break;
1234    
1235            case PT_SC:            case PT_SC:
1236            OK = script == code[3];            OK = prop->script == code[3];
1237            break;            break;
1238    
1239            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
# Line 1166  for (;;) Line 1271  for (;;)
1271        QS2:        QS2:
1272    
1273        ADD_ACTIVE(state_offset + 2, 0);        ADD_ACTIVE(state_offset + 2, 0);
1274        if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1275          {          {
1276          const uschar *nptr = ptr + clen;          const uschar *nptr = ptr + clen;
1277          int ncount = 0;          int ncount = 0;
# Line 1181  for (;;) Line 1286  for (;;)
1286            int nd;            int nd;
1287            int ndlen = 1;            int ndlen = 1;
1288            GETCHARLEN(nd, nptr, ndlen);            GETCHARLEN(nd, nptr, ndlen);
1289            if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;            if (UCD_CATEGORY(nd) != ucp_M) break;
1290            ncount++;            ncount++;
1291            nptr += ndlen;            nptr += ndlen;
1292            }            }
# Line 1209  for (;;) Line 1314  for (;;)
1314          int ncount = 0;          int ncount = 0;
1315          switch (c)          switch (c)
1316            {            {
1317              case 0x000b:
1318              case 0x000c:
1319              case 0x0085:
1320              case 0x2028:
1321              case 0x2029:
1322              if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1323              goto ANYNL02;
1324    
1325            case 0x000d:            case 0x000d:
1326            if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;            if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1327            /* Fall through */            /* Fall through */
1328    
1329              ANYNL02:
1330              case 0x000a:
1331              if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1332                  codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1333                {
1334                active_count--;           /* Remove non-match possibility */
1335                next_active_state--;
1336                }
1337              ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1338              break;
1339    
1340              default:
1341              break;
1342              }
1343            }
1344          break;
1345    
1346          /*-----------------------------------------------------------------*/
1347          case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1348          case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1349          case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1350          count = 2;
1351          goto QS4;
1352    
1353          case OP_VSPACE_EXTRA + OP_TYPESTAR:
1354          case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1355          case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1356          count = 0;
1357    
1358          QS4:
1359          ADD_ACTIVE(state_offset + 2, 0);
1360          if (clen > 0)
1361            {
1362            BOOL OK;
1363            switch (c)
1364              {
1365            case 0x000a:            case 0x000a:
1366            case 0x000b:            case 0x000b:
1367            case 0x000c:            case 0x000c:
1368              case 0x000d:
1369            case 0x0085:            case 0x0085:
1370            case 0x2028:            case 0x2028:
1371            case 0x2029:            case 0x2029:
1372            if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||            OK = TRUE;
1373                codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)            break;
1374    
1375              default:
1376              OK = FALSE;
1377              break;
1378              }
1379            if (OK == (d == OP_VSPACE))
1380              {
1381              if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1382                  codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1383              {              {
1384              active_count--;           /* Remove non-match possibility */              active_count--;           /* Remove non-match possibility */
1385              next_active_state--;              next_active_state--;
1386              }              }
1387            ADD_NEW_DATA(-(state_offset + count), 0, ncount);            ADD_NEW_DATA(-(state_offset + count), 0, 0);
1388              }
1389            }
1390          break;
1391    
1392          /*-----------------------------------------------------------------*/
1393          case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1394          case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1395          case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1396          count = 2;
1397          goto QS5;
1398    
1399          case OP_HSPACE_EXTRA + OP_TYPESTAR:
1400          case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1401          case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1402          count = 0;
1403    
1404          QS5:
1405          ADD_ACTIVE(state_offset + 2, 0);
1406          if (clen > 0)
1407            {
1408            BOOL OK;
1409            switch (c)
1410              {
1411              case 0x09:      /* HT */
1412              case 0x20:      /* SPACE */
1413              case 0xa0:      /* NBSP */
1414              case 0x1680:    /* OGHAM SPACE MARK */
1415              case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1416              case 0x2000:    /* EN QUAD */
1417              case 0x2001:    /* EM QUAD */
1418              case 0x2002:    /* EN SPACE */
1419              case 0x2003:    /* EM SPACE */
1420              case 0x2004:    /* THREE-PER-EM SPACE */
1421              case 0x2005:    /* FOUR-PER-EM SPACE */
1422              case 0x2006:    /* SIX-PER-EM SPACE */
1423              case 0x2007:    /* FIGURE SPACE */
1424              case 0x2008:    /* PUNCTUATION SPACE */
1425              case 0x2009:    /* THIN SPACE */
1426              case 0x200A:    /* HAIR SPACE */
1427              case 0x202f:    /* NARROW NO-BREAK SPACE */
1428              case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1429              case 0x3000:    /* IDEOGRAPHIC SPACE */
1430              OK = TRUE;
1431            break;            break;
1432    
1433            default:            default:
1434              OK = FALSE;
1435            break;            break;
1436            }            }
1437    
1438            if (OK == (d == OP_HSPACE))
1439              {
1440              if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1441                  codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1442                {
1443                active_count--;           /* Remove non-match possibility */
1444                next_active_state--;
1445                }
1446              ADD_NEW_DATA(-(state_offset + count), 0, 0);
1447              }
1448          }          }
1449        break;        break;
1450    
# Line 1244  for (;;) Line 1460  for (;;)
1460        if (clen > 0)        if (clen > 0)
1461          {          {
1462          BOOL OK;          BOOL OK;
1463          int category = _pcre_ucp_findprop(c, &chartype, &script);          const ucd_record * prop = GET_UCD(c);
1464          switch(code[4])          switch(code[4])
1465            {            {
1466            case PT_ANY:            case PT_ANY:
# Line 1252  for (;;) Line 1468  for (;;)
1468            break;            break;
1469    
1470            case PT_LAMP:            case PT_LAMP:
1471            OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1472            break;            break;
1473    
1474            case PT_GC:            case PT_GC:
1475            OK = category == code[5];            OK = _pcre_ucp_gentype[prop->chartype] == code[5];
1476            break;            break;
1477    
1478            case PT_PC:            case PT_PC:
1479            OK = chartype == code[5];            OK = prop->chartype == code[5];
1480            break;            break;
1481    
1482            case PT_SC:            case PT_SC:
1483            OK = script == code[5];            OK = prop->script == code[5];
1484            break;            break;
1485    
1486            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
# Line 1297  for (;;) Line 1513  for (;;)
1513        if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)        if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1514          { ADD_ACTIVE(state_offset + 4, 0); }          { ADD_ACTIVE(state_offset + 4, 0); }
1515        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
1516        if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1517          {          {
1518          const uschar *nptr = ptr + clen;          const uschar *nptr = ptr + clen;
1519          int ncount = 0;          int ncount = 0;
# Line 1311  for (;;) Line 1527  for (;;)
1527            int nd;            int nd;
1528            int ndlen = 1;            int ndlen = 1;
1529            GETCHARLEN(nd, nptr, ndlen);            GETCHARLEN(nd, nptr, ndlen);
1530            if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;            if (UCD_CATEGORY(nd) != ucp_M) break;
1531            ncount++;            ncount++;
1532            nptr += ndlen;            nptr += ndlen;
1533            }            }
# Line 1336  for (;;) Line 1552  for (;;)
1552          int ncount = 0;          int ncount = 0;
1553          switch (c)          switch (c)
1554            {            {
1555              case 0x000b:
1556              case 0x000c:
1557              case 0x0085:
1558              case 0x2028:
1559              case 0x2029:
1560              if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1561              goto ANYNL03;
1562    
1563            case 0x000d:            case 0x000d:
1564            if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;            if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1565            /* Fall through */            /* Fall through */
1566    
1567              ANYNL03:
1568              case 0x000a:
1569              if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1570                {
1571                active_count--;           /* Remove non-match possibility */
1572                next_active_state--;
1573                }
1574              if (++count >= GET2(code, 1))
1575                { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1576              else
1577                { ADD_NEW_DATA(-state_offset, count, ncount); }
1578              break;
1579    
1580              default:
1581              break;
1582              }
1583            }
1584          break;
1585    
1586          /*-----------------------------------------------------------------*/
1587          case OP_VSPACE_EXTRA + OP_TYPEEXACT:
1588          case OP_VSPACE_EXTRA + OP_TYPEUPTO:
1589          case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1590          case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1591          if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1592            { ADD_ACTIVE(state_offset + 4, 0); }
1593          count = current_state->count;  /* Number already matched */
1594          if (clen > 0)
1595            {
1596            BOOL OK;
1597            switch (c)
1598              {
1599            case 0x000a:            case 0x000a:
1600            case 0x000b:            case 0x000b:
1601            case 0x000c:            case 0x000c:
1602              case 0x000d:
1603            case 0x0085:            case 0x0085:
1604            case 0x2028:            case 0x2028:
1605            case 0x2029:            case 0x2029:
1606            if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)            OK = TRUE;
1607              break;
1608    
1609              default:
1610              OK = FALSE;
1611              }
1612    
1613            if (OK == (d == OP_VSPACE))
1614              {
1615              if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
1616              {              {
1617              active_count--;           /* Remove non-match possibility */              active_count--;           /* Remove non-match possibility */
1618              next_active_state--;              next_active_state--;
1619              }              }
1620            if (++count >= GET2(code, 1))            if (++count >= GET2(code, 1))
1621              { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }              { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1622            else            else
1623              { ADD_NEW_DATA(-state_offset, count, ncount); }              { ADD_NEW_DATA(-state_offset, count, 0); }
1624              }
1625            }
1626          break;
1627    
1628          /*-----------------------------------------------------------------*/
1629          case OP_HSPACE_EXTRA + OP_TYPEEXACT:
1630          case OP_HSPACE_EXTRA + OP_TYPEUPTO:
1631          case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
1632          case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
1633          if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
1634            { ADD_ACTIVE(state_offset + 4, 0); }
1635          count = current_state->count;  /* Number already matched */
1636          if (clen > 0)
1637            {
1638            BOOL OK;
1639            switch (c)
1640              {
1641              case 0x09:      /* HT */
1642              case 0x20:      /* SPACE */
1643              case 0xa0:      /* NBSP */
1644              case 0x1680:    /* OGHAM SPACE MARK */
1645              case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1646              case 0x2000:    /* EN QUAD */
1647              case 0x2001:    /* EM QUAD */
1648              case 0x2002:    /* EN SPACE */
1649              case 0x2003:    /* EM SPACE */
1650              case 0x2004:    /* THREE-PER-EM SPACE */
1651              case 0x2005:    /* FOUR-PER-EM SPACE */
1652              case 0x2006:    /* SIX-PER-EM SPACE */
1653              case 0x2007:    /* FIGURE SPACE */
1654              case 0x2008:    /* PUNCTUATION SPACE */
1655              case 0x2009:    /* THIN SPACE */
1656              case 0x200A:    /* HAIR SPACE */
1657              case 0x202f:    /* NARROW NO-BREAK SPACE */
1658              case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1659              case 0x3000:    /* IDEOGRAPHIC SPACE */
1660              OK = TRUE;
1661            break;            break;
1662    
1663            default:            default:
1664              OK = FALSE;
1665            break;            break;
1666            }            }
1667    
1668            if (OK == (d == OP_HSPACE))
1669              {
1670              if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
1671                {
1672                active_count--;           /* Remove non-match possibility */
1673                next_active_state--;
1674                }
1675              if (++count >= GET2(code, 1))
1676                { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1677              else
1678                { ADD_NEW_DATA(-state_offset, count, 0); }
1679              }
1680          }          }
1681        break;        break;
1682    
# Line 1388  for (;;) Line 1707  for (;;)
1707            other case of the character. */            other case of the character. */
1708    
1709  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
1710            othercase = _pcre_ucp_othercase(c);            othercase = UCD_OTHERCASE(c);
1711  #else  #else
1712            othercase = NOTACHAR;            othercase = NOTACHAR;
1713  #endif  #endif
# Line 1413  for (;;) Line 1732  for (;;)
1732        to wait for them to pass before continuing. */        to wait for them to pass before continuing. */
1733    
1734        case OP_EXTUNI:        case OP_EXTUNI:
1735        if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1736          {          {
1737          const uschar *nptr = ptr + clen;          const uschar *nptr = ptr + clen;
1738          int ncount = 0;          int ncount = 0;
# Line 1421  for (;;) Line 1740  for (;;)
1740            {            {
1741            int nclen = 1;            int nclen = 1;
1742            GETCHARLEN(c, nptr, nclen);            GETCHARLEN(c, nptr, nclen);
1743            if (_pcre_ucp_findprop(c, &chartype, &script) != ucp_M) break;            if (UCD_CATEGORY(c) != ucp_M) break;
1744            ncount++;            ncount++;
1745            nptr += nclen;            nptr += nclen;
1746            }            }
# Line 1438  for (;;) Line 1757  for (;;)
1757        case OP_ANYNL:        case OP_ANYNL:
1758        if (clen > 0) switch(c)        if (clen > 0) switch(c)
1759          {          {
         case 0x000a:  
1760          case 0x000b:          case 0x000b:
1761          case 0x000c:          case 0x000c:
1762          case 0x0085:          case 0x0085:
1763          case 0x2028:          case 0x2028:
1764          case 0x2029:          case 0x2029:
1765            if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1766    
1767            case 0x000a:
1768          ADD_NEW(state_offset + 1, 0);          ADD_NEW(state_offset + 1, 0);
1769          break;          break;
1770    
1771          case 0x000d:          case 0x000d:
1772          if (ptr + 1 < end_subject && ptr[1] == 0x0a)          if (ptr + 1 < end_subject && ptr[1] == 0x0a)
1773            {            {
# Line 1460  for (;;) Line 1782  for (;;)
1782        break;        break;
1783    
1784        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1785          case OP_NOT_VSPACE:
1786          if (clen > 0) switch(c)
1787            {
1788            case 0x000a:
1789            case 0x000b:
1790            case 0x000c:
1791            case 0x000d:
1792            case 0x0085:
1793            case 0x2028:
1794            case 0x2029:
1795            break;
1796    
1797            default:
1798            ADD_NEW(state_offset + 1, 0);
1799            break;
1800            }
1801          break;
1802    
1803          /*-----------------------------------------------------------------*/
1804          case OP_VSPACE:
1805          if (clen > 0) switch(c)
1806            {
1807            case 0x000a:
1808            case 0x000b:
1809            case 0x000c:
1810            case 0x000d:
1811            case 0x0085:
1812            case 0x2028:
1813            case 0x2029:
1814            ADD_NEW(state_offset + 1, 0);
1815            break;
1816    
1817            default: break;
1818            }
1819          break;
1820    
1821          /*-----------------------------------------------------------------*/
1822          case OP_NOT_HSPACE:
1823          if (clen > 0) switch(c)
1824            {
1825            case 0x09:      /* HT */
1826            case 0x20:      /* SPACE */
1827            case 0xa0:      /* NBSP */
1828            case 0x1680:    /* OGHAM SPACE MARK */
1829            case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1830            case 0x2000:    /* EN QUAD */
1831            case 0x2001:    /* EM QUAD */
1832            case 0x2002:    /* EN SPACE */
1833            case 0x2003:    /* EM SPACE */
1834            case 0x2004:    /* THREE-PER-EM SPACE */
1835            case 0x2005:    /* FOUR-PER-EM SPACE */
1836            case 0x2006:    /* SIX-PER-EM SPACE */
1837            case 0x2007:    /* FIGURE SPACE */
1838            case 0x2008:    /* PUNCTUATION SPACE */
1839            case 0x2009:    /* THIN SPACE */
1840            case 0x200A:    /* HAIR SPACE */
1841            case 0x202f:    /* NARROW NO-BREAK SPACE */
1842            case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1843            case 0x3000:    /* IDEOGRAPHIC SPACE */
1844            break;
1845    
1846            default:
1847            ADD_NEW(state_offset + 1, 0);
1848            break;
1849            }
1850          break;
1851    
1852          /*-----------------------------------------------------------------*/
1853          case OP_HSPACE:
1854          if (clen > 0) switch(c)
1855            {
1856            case 0x09:      /* HT */
1857            case 0x20:      /* SPACE */
1858            case 0xa0:      /* NBSP */
1859            case 0x1680:    /* OGHAM SPACE MARK */
1860            case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1861            case 0x2000:    /* EN QUAD */
1862            case 0x2001:    /* EM QUAD */
1863            case 0x2002:    /* EN SPACE */
1864            case 0x2003:    /* EM SPACE */
1865            case 0x2004:    /* THREE-PER-EM SPACE */
1866            case 0x2005:    /* FOUR-PER-EM SPACE */
1867            case 0x2006:    /* SIX-PER-EM SPACE */
1868            case 0x2007:    /* FIGURE SPACE */
1869            case 0x2008:    /* PUNCTUATION SPACE */
1870            case 0x2009:    /* THIN SPACE */
1871            case 0x200A:    /* HAIR SPACE */
1872            case 0x202f:    /* NARROW NO-BREAK SPACE */
1873            case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1874            case 0x3000:    /* IDEOGRAPHIC SPACE */
1875            ADD_NEW(state_offset + 1, 0);
1876            break;
1877            }
1878          break;
1879    
1880          /*-----------------------------------------------------------------*/
1881        /* Match a negated single character. This is only used for one-byte        /* Match a negated single character. This is only used for one-byte
1882        characters, that is, we know that d < 256. The character we are        characters, that is, we know that d < 256. The character we are
1883        checking (c) can be multibyte. */        checking (c) can be multibyte. */
# Line 1490  for (;;) Line 1908  for (;;)
1908            if (utf8 && d >= 128)            if (utf8 && d >= 128)
1909              {              {
1910  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
1911              otherd = _pcre_ucp_othercase(d);              otherd = UCD_OTHERCASE(d);
1912  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
1913              }              }
1914            else            else
# Line 1528  for (;;) Line 1946  for (;;)
1946            if (utf8 && d >= 128)            if (utf8 && d >= 128)
1947              {              {
1948  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
1949              otherd = _pcre_ucp_othercase(d);              otherd = UCD_OTHERCASE(d);
1950  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
1951              }              }
1952            else            else
# Line 1564  for (;;) Line 1982  for (;;)
1982            if (utf8 && d >= 128)            if (utf8 && d >= 128)
1983              {              {
1984  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
1985              otherd = _pcre_ucp_othercase(d);              otherd = UCD_OTHERCASE(d);
1986  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
1987              }              }
1988            else            else
# Line 1596  for (;;) Line 2014  for (;;)
2014            if (utf8 && d >= 128)            if (utf8 && d >= 128)
2015              {              {
2016  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2017              otherd = _pcre_ucp_othercase(d);              otherd = UCD_OTHERCASE(d);
2018  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2019              }              }
2020            else            else
# Line 1631  for (;;) Line 2049  for (;;)
2049            if (utf8 && d >= 128)            if (utf8 && d >= 128)
2050              {              {
2051  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2052              otherd = _pcre_ucp_othercase(d);              otherd = UCD_OTHERCASE(d);
2053  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2054              }              }
2055            else            else
# Line 1739  for (;;) Line 2157  for (;;)
2157    
2158  /* ========================================================================== */  /* ========================================================================== */
2159        /* These are the opcodes for fancy brackets of various kinds. We have        /* These are the opcodes for fancy brackets of various kinds. We have
2160        to use recursion in order to handle them. */        to use recursion in order to handle them. The "always failing" assertion
2161          (?!) is optimised when compiling to OP_FAIL, so we have to support that,
2162          though the other "backtracking verbs" are not supported. */
2163    
2164          case OP_FAIL:
2165          break;
2166    
2167        case OP_ASSERT:        case OP_ASSERT:
2168        case OP_ASSERT_NOT:        case OP_ASSERT_NOT:
# Line 1777  for (;;) Line 2200  for (;;)
2200          {          {
2201          int local_offsets[1000];          int local_offsets[1000];
2202          int local_workspace[1000];          int local_workspace[1000];
2203          int condcode = code[LINK_SIZE+1];          int codelink = GET(code, 1);
2204            int condcode;
2205    
2206            /* Because of the way auto-callout works during compile, a callout item
2207            is inserted between OP_COND and an assertion condition. This does not
2208            happen for the other conditions. */
2209    
2210            if (code[LINK_SIZE+1] == OP_CALLOUT)
2211              {
2212              rrc = 0;
2213              if (pcre_callout != NULL)
2214                {
2215                pcre_callout_block cb;
2216                cb.version          = 1;   /* Version 1 of the callout block */
2217                cb.callout_number   = code[LINK_SIZE+2];
2218                cb.offset_vector    = offsets;
2219                cb.subject          = (PCRE_SPTR)start_subject;
2220                cb.subject_length   = end_subject - start_subject;
2221                cb.start_match      = current_subject - start_subject;
2222                cb.current_position = ptr - start_subject;
2223                cb.pattern_position = GET(code, LINK_SIZE + 3);
2224                cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
2225                cb.capture_top      = 1;
2226                cb.capture_last     = -1;
2227                cb.callout_data     = md->callout_data;
2228                if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc;   /* Abandon */
2229                }
2230              if (rrc > 0) break;                      /* Fail this thread */
2231              code += _pcre_OP_lengths[OP_CALLOUT];    /* Skip callout data */
2232              }
2233    
2234            condcode = code[LINK_SIZE+1];
2235    
2236          /* Back reference conditions are not supported */          /* Back reference conditions are not supported */
2237    
# Line 1786  for (;;) Line 2240  for (;;)
2240          /* The DEFINE condition is always false */          /* The DEFINE condition is always false */
2241    
2242          if (condcode == OP_DEF)          if (condcode == OP_DEF)
2243            {            { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
           ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0);  
           }  
2244    
2245          /* The only supported version of OP_RREF is for the value RREF_ANY,          /* The only supported version of OP_RREF is for the value RREF_ANY,
2246          which means "test if in any recursion". We can't test for specifically          which means "test if in any recursion". We can't test for specifically
# Line 1798  for (;;) Line 2250  for (;;)
2250            {            {
2251            int value = GET2(code, LINK_SIZE+2);            int value = GET2(code, LINK_SIZE+2);
2252            if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;            if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2253            if (recursing > 0) { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }            if (recursing > 0)
2254              else { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }              { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
2255              else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2256            }            }
2257    
2258          /* Otherwise, the condition is an assertion */          /* Otherwise, the condition is an assertion */
# Line 1829  for (;;) Line 2282  for (;;)
2282                  (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))                  (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2283              { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }              { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2284            else            else
2285              { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }              { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2286            }            }
2287          }          }
2288        break;        break;
# Line 1981  for (;;) Line 2434  for (;;)
2434        /* Handle callouts */        /* Handle callouts */
2435    
2436        case OP_CALLOUT:        case OP_CALLOUT:
2437          rrc = 0;
2438        if (pcre_callout != NULL)        if (pcre_callout != NULL)
2439          {          {
         int rrc;  
2440          pcre_callout_block cb;          pcre_callout_block cb;
2441          cb.version          = 1;   /* Version 1 of the callout block */          cb.version          = 1;   /* Version 1 of the callout block */
2442          cb.callout_number   = code[1];          cb.callout_number   = code[1];
# Line 1998  for (;;) Line 2451  for (;;)
2451          cb.capture_last     = -1;          cb.capture_last     = -1;
2452          cb.callout_data     = md->callout_data;          cb.callout_data     = md->callout_data;
2453          if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc;   /* Abandon */          if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc;   /* Abandon */
         if (rrc == 0) { ADD_ACTIVE(state_offset + 2 + 2*LINK_SIZE, 0); }  
2454          }          }
2455          if (rrc == 0)
2456            { ADD_ACTIVE(state_offset + _pcre_OP_lengths[OP_CALLOUT], 0); }
2457        break;        break;
2458    
2459    
# Line 2082  Returns: > 0 => number of match Line 2536  Returns: > 0 => number of match
2536                   < -1 => some kind of unexpected problem                   < -1 => some kind of unexpected problem
2537  */  */
2538    
2539  PCRE_EXP_DEFN int  PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
2540  pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,  pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
2541    const char *subject, int length, int start_offset, int options, int *offsets,    const char *subject, int length, int start_offset, int options, int *offsets,
2542    int offsetcount, int *workspace, int wscount)    int offsetcount, int *workspace, int wscount)
# Line 2172  md->end_subject = end_subject; Line 2626  md->end_subject = end_subject;
2626  md->moptions = options;  md->moptions = options;
2627  md->poptions = re->options;  md->poptions = re->options;
2628    
2629    /* If the BSR option is not set at match time, copy what was set
2630    at compile time. */
2631    
2632    if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
2633      {
2634      if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
2635        md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
2636    #ifdef BSR_ANYCRLF
2637      else md->moptions |= PCRE_BSR_ANYCRLF;
2638    #endif
2639      }
2640    
2641  /* Handle different types of newline. The three bits give eight cases. If  /* Handle different types of newline. The three bits give eight cases. If
2642  nothing is set at run time, whatever was used at compile time applies. */  nothing is set at run time, whatever was used at compile time applies. */
2643    
# Line 2179  switch ((((options & PCRE_NEWLINE_BITS) Line 2645  switch ((((options & PCRE_NEWLINE_BITS)
2645           PCRE_NEWLINE_BITS)           PCRE_NEWLINE_BITS)
2646    {    {
2647    case 0: newline = NEWLINE; break;   /* Compile-time default */    case 0: newline = NEWLINE; break;   /* Compile-time default */
2648    case PCRE_NEWLINE_CR: newline = '\r'; break;    case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
2649    case PCRE_NEWLINE_LF: newline = '\n'; break;    case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
2650    case PCRE_NEWLINE_CR+    case PCRE_NEWLINE_CR+
2651         PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;         PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
2652    case PCRE_NEWLINE_ANY: newline = -1; break;    case PCRE_NEWLINE_ANY: newline = -1; break;
2653    case PCRE_NEWLINE_ANYCRLF: newline = -2; break;    case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
2654    default: return PCRE_ERROR_BADNEWLINE;    default: return PCRE_ERROR_BADNEWLINE;
# Line 2242  if (md->tables == NULL) md->tables = _pc Line 2708  if (md->tables == NULL) md->tables = _pc
2708  used in a loop when finding where to start. */  used in a loop when finding where to start. */
2709    
2710  lcc = md->tables + lcc_offset;  lcc = md->tables + lcc_offset;
2711  startline = (re->options & PCRE_STARTLINE) != 0;  startline = (re->flags & PCRE_STARTLINE) != 0;
2712  firstline = (re->options & PCRE_FIRSTLINE) != 0;  firstline = (re->options & PCRE_FIRSTLINE) != 0;
2713    
2714  /* Set up the first character to match, if available. The first_byte value is  /* Set up the first character to match, if available. The first_byte value is
# Line 2253  studied, there may be a bitmap of possib Line 2719  studied, there may be a bitmap of possib
2719    
2720  if (!anchored)  if (!anchored)
2721    {    {
2722    if ((re->options & PCRE_FIRSTSET) != 0)    if ((re->flags & PCRE_FIRSTSET) != 0)
2723      {      {
2724      first_byte = re->first_byte & 255;      first_byte = re->first_byte & 255;
2725      if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)      if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
# Line 2270  if (!anchored) Line 2736  if (!anchored)
2736  /* For anchored or unanchored matches, there may be a "last known required  /* For anchored or unanchored matches, there may be a "last known required
2737  character" set. */  character" set. */
2738    
2739  if ((re->options & PCRE_REQCHSET) != 0)  if ((re->flags & PCRE_REQCHSET) != 0)
2740    {    {
2741    req_byte = re->req_byte & 255;    req_byte = re->req_byte & 255;
2742    req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;    req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
# Line 2278  if ((re->options & PCRE_REQCHSET) != 0) Line 2744  if ((re->options & PCRE_REQCHSET) != 0)
2744    }    }
2745    
2746  /* Call the main matching function, looping for a non-anchored regex after a  /* Call the main matching function, looping for a non-anchored regex after a
2747  failed match. Unless restarting, optimize by moving to the first match  failed match. If not restarting, perform certain optimizations at the start of
2748  character if possible, when not anchored. Then unless wanting a partial match,  a match. */
 check for a required later character. */  
2749    
2750  for (;;)  for (;;)
2751    {    {
# Line 2290  for (;;) Line 2755  for (;;)
2755      {      {
2756      const uschar *save_end_subject = end_subject;      const uschar *save_end_subject = end_subject;
2757    
2758      /* Advance to a unique first char if possible. If firstline is TRUE, the      /* If firstline is TRUE, the start of the match is constrained to the first
2759      start of the match is constrained to the first line of a multiline string.      line of a multiline string. Implement this by temporarily adjusting
2760      Implement this by temporarily adjusting end_subject so that we stop      end_subject so that we stop scanning at a newline. If the match fails at
2761      scanning at a newline. If the match fails at the newline, later code breaks      the newline, later code breaks this loop. */
     this loop. */  
2762    
2763      if (firstline)      if (firstline)
2764        {        {
2765        const uschar *t = current_subject;        USPTR t = current_subject;
2766    #ifdef SUPPORT_UTF8
2767          if (utf8)
2768            {
2769            while (t < md->end_subject && !IS_NEWLINE(t))
2770              {
2771              t++;
2772              while (t < end_subject && (*t & 0xc0) == 0x80) t++;
2773              }
2774            }
2775          else
2776    #endif
2777        while (t < md->end_subject && !IS_NEWLINE(t)) t++;        while (t < md->end_subject && !IS_NEWLINE(t)) t++;
2778        end_subject = t;        end_subject = t;
2779        }        }
2780    
2781      if (first_byte >= 0)      /* There are some optimizations that avoid running the match if a known
2782        starting point is not found, or if a known later character is not present.
2783        However, there is an option that disables these, for testing and for
2784        ensuring that all callouts do actually occur. */
2785    
2786        if ((options & PCRE_NO_START_OPTIMIZE) == 0)
2787        {        {
       if (first_byte_caseless)  
         while (current_subject < end_subject &&  
                lcc[*current_subject] != first_byte)  
           current_subject++;  
       else  
         while (current_subject < end_subject && *current_subject != first_byte)  
           current_subject++;  
       }  
2788    
2789      /* Or to just after a linebreak for a multiline match if possible */        /* Advance to a known first byte. */
2790    
2791      else if (startline)        if (first_byte >= 0)
       {  
       if (current_subject > md->start_subject + start_offset)  
2792          {          {
2793          while (current_subject <= end_subject && !WAS_NEWLINE(current_subject))          if (first_byte_caseless)
2794            current_subject++;            while (current_subject < end_subject &&
2795                     lcc[*current_subject] != first_byte)
2796                current_subject++;
2797            else
2798              while (current_subject < end_subject &&
2799                     *current_subject != first_byte)
2800                current_subject++;
2801            }
2802    
2803          /* Or to just after a linebreak for a multiline match if possible */
2804    
2805          /* If we have just passed a CR and the newline option is ANY or        else if (startline)
2806          ANYCRLF, and we are now at a LF, advance the match position by one more          {
2807          character. */          if (current_subject > md->start_subject + start_offset)
2808              {
2809          if (current_subject[-1] == '\r' &&  #ifdef SUPPORT_UTF8
2810               (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&            if (utf8)
2811               current_subject < end_subject &&              {
2812               *current_subject == '\n')              while (current_subject < end_subject &&
2813            current_subject++;                     !WAS_NEWLINE(current_subject))
2814                  {
2815                  current_subject++;
2816                  while(current_subject < end_subject &&
2817                        (*current_subject & 0xc0) == 0x80)
2818                    current_subject++;
2819                  }
2820                }
2821              else
2822    #endif
2823              while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
2824                current_subject++;
2825    
2826              /* If we have just passed a CR and the newline option is ANY or
2827              ANYCRLF, and we are now at a LF, advance the match position by one
2828              more character. */
2829    
2830              if (current_subject[-1] == CHAR_CR &&
2831                   (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
2832                   current_subject < end_subject &&
2833                   *current_subject == CHAR_NL)
2834                current_subject++;
2835              }
2836          }          }
       }  
2837    
2838      /* Or to a non-unique first char after study */        /* Or to a non-unique first char after study */
2839    
2840      else if (start_bits != NULL)        else if (start_bits != NULL)
       {  
       while (current_subject < end_subject)  
2841          {          {
2842          register unsigned int c = *current_subject;          while (current_subject < end_subject)
2843          if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;            {
2844            else break;            register unsigned int c = *current_subject;
2845              if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
2846                else break;
2847              }
2848          }          }
2849        }        }
2850    
# Line 2365  for (;;) Line 2866  for (;;)
2866    showed up when somebody was matching /^C/ on a 32-megabyte string... so we    showed up when somebody was matching /^C/ on a 32-megabyte string... so we
2867    don't do this when the string is sufficiently long.    don't do this when the string is sufficiently long.
2868    
2869    ALSO: this processing is disabled when partial matching is requested.    ALSO: this processing is disabled when partial matching is requested, and can
2870    */    also be explicitly deactivated. */
2871    
2872    if (req_byte >= 0 &&    if ((options & PCRE_NO_START_OPTIMIZE) == 0 &&
2873          req_byte >= 0 &&
2874        end_subject - current_subject < REQ_BYTE_MAX &&        end_subject - current_subject < REQ_BYTE_MAX &&
2875        (options & PCRE_PARTIAL) == 0)        (options & PCRE_PARTIAL) == 0)
2876      {      {
# Line 2440  for (;;) Line 2942  for (;;)
2942      }      }
2943    if (current_subject > end_subject) break;    if (current_subject > end_subject) break;
2944    
2945    /* If we have just passed a CR and the newline option is CRLF or ANY or    /* If we have just passed a CR and we are now at a LF, and the pattern does
2946    ANYCRLF, and we are now at a LF, advance the match position by one more    not contain any explicit matches for \r or \n, and the newline option is CRLF
2947    character. */    or ANY or ANYCRLF, advance the match position by one more character. */
2948    
2949    if (current_subject[-1] == '\r' &&    if (current_subject[-1] == CHAR_CR &&
2950         (md->nltype == NLTYPE_ANY ||        current_subject < end_subject &&
2951          md->nltype == NLTYPE_ANYCRLF ||        *current_subject == CHAR_NL &&
2952          md->nllen == 2) &&        (re->flags & PCRE_HASCRORLF) == 0 &&
2953         current_subject < end_subject &&          (md->nltype == NLTYPE_ANY ||
2954         *current_subject == '\n')           md->nltype == NLTYPE_ANYCRLF ||
2955             md->nllen == 2))
2956      current_subject++;      current_subject++;
2957    
2958    }   /* "Bumpalong" loop */    }   /* "Bumpalong" loop */

Legend:
Removed from v.168  
changed lines
  Added in v.406

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12