/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Diff of /code/trunk/pcre_dfa_exec.c

Parent Directory | Revision Log | View Patch

revision 327 by ph10, Sat Mar 8 19:38:30 2008 UTC (left column) vs. revision 435 by ph10, Sat Sep 5 10:20:44 2009 UTC (right column)
# Line 3  Line 3 
3  *************************************************/  *************************************************/
4    
5  /* PCRE is a library of functions to support regular expressions whose syntax  /* PCRE is a library of functions to support regular expressions whose syntax
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language (but see
7    below for why this module is different).
8    
9                         Written by Philip Hazel                         Written by Philip Hazel
10             Copyright (c) 1997-2008 University of Cambridge             Copyright (c) 1997-2009 University of Cambridge
11    
12  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
13  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 60  applications. */ Line 61  applications. */
61  #define SP "                   "  #define SP "                   "
62    
63    
   
64  /*************************************************  /*************************************************
65  *      Code parameters and static tables         *  *      Code parameters and static tables         *
66  *************************************************/  *************************************************/
# Line 88  static const uschar coptable[] = { Line 88  static const uschar coptable[] = {
88    0,                             /* End                                    */    0,                             /* End                                    */
89    0, 0, 0, 0, 0,                 /* \A, \G, \K, \B, \b                     */    0, 0, 0, 0, 0,                 /* \A, \G, \K, \B, \b                     */
90    0, 0, 0, 0, 0, 0,              /* \D, \d, \S, \s, \W, \w                 */    0, 0, 0, 0, 0, 0,              /* \D, \d, \S, \s, \W, \w                 */
91    0, 0,                          /* Any, Anybyte                           */    0, 0, 0,                       /* Any, AllAny, Anybyte                   */
92    0, 0, 0,                       /* NOTPROP, PROP, EXTUNI                  */    0, 0, 0,                       /* NOTPROP, PROP, EXTUNI                  */
93    0, 0, 0, 0, 0,                 /* \R, \H, \h, \V, \v                     */    0, 0, 0, 0, 0,                 /* \R, \H, \h, \V, \v                     */
94    0, 0, 0, 0, 0,                 /* \Z, \z, Opt, ^, $                      */    0, 0, 0, 0, 0,                 /* \Z, \z, Opt, ^, $                      */
# Line 132  static const uschar coptable[] = { Line 132  static const uschar coptable[] = {
132    0,                             /* DEF                                    */    0,                             /* DEF                                    */
133    0, 0,                          /* BRAZERO, BRAMINZERO                    */    0, 0,                          /* BRAZERO, BRAMINZERO                    */
134    0, 0, 0, 0,                    /* PRUNE, SKIP, THEN, COMMIT              */    0, 0, 0, 0,                    /* PRUNE, SKIP, THEN, COMMIT              */
135    0, 0                           /* FAIL, ACCEPT                           */    0, 0, 0                        /* FAIL, ACCEPT, SKIPZERO                 */
136  };  };
137    
138  /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,  /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
# Line 143  static const uschar toptable1[] = { Line 143  static const uschar toptable1[] = {
143    ctype_digit, ctype_digit,    ctype_digit, ctype_digit,
144    ctype_space, ctype_space,    ctype_space, ctype_space,
145    ctype_word,  ctype_word,    ctype_word,  ctype_word,
146    0                               /* OP_ANY */    0, 0                            /* OP_ANY, OP_ALLANY */
147  };  };
148    
149  static const uschar toptable2[] = {  static const uschar toptable2[] = {
# Line 151  static const uschar toptable2[] = { Line 151  static const uschar toptable2[] = {
151    ctype_digit, 0,    ctype_digit, 0,
152    ctype_space, 0,    ctype_space, 0,
153    ctype_word,  0,    ctype_word,  0,
154    1                               /* OP_ANY */    1, 1                            /* OP_ANY, OP_ALLANY */
155  };  };
156    
157    
# Line 223  Arguments: Line 223  Arguments:
223    rlevel            function call recursion level    rlevel            function call recursion level
224    recursing         regex recursive call level    recursing         regex recursive call level
225    
226  Returns:            > 0 =>  Returns:            > 0 => number of match offset pairs placed in offsets
227                      = 0 =>                      = 0 => offsets overflowed; longest matches are present
228                       -1 => failed to match                       -1 => failed to match
229                     < -1 => some kind of unexpected problem                     < -1 => some kind of unexpected problem
230    
# Line 389  if (*first_op == OP_REVERSE) Line 389  if (*first_op == OP_REVERSE)
389        current_subject - start_subject : max_back;        current_subject - start_subject : max_back;
390      current_subject -= gone_back;      current_subject -= gone_back;
391      }      }
392    
393      /* Save the earliest consulted character */
394    
395      if (current_subject < md->start_used_ptr)
396        md->start_used_ptr = current_subject;
397    
398    /* Now we can process the individual branches. */    /* Now we can process the individual branches. */
399    
# Line 454  for (;;) Line 459  for (;;)
459    int i, j;    int i, j;
460    int clen, dlen;    int clen, dlen;
461    unsigned int c, d;    unsigned int c, d;
462      int forced_fail = 0;
463      int reached_end = 0;
464    
465    /* Make the new state list into the active state list and empty the    /* Make the new state list into the active state list and empty the
466    new state list. */    new state list. */
# Line 511  for (;;) Line 518  for (;;)
518      stateblock *current_state = active_states + i;      stateblock *current_state = active_states + i;
519      const uschar *code;      const uschar *code;
520      int state_offset = current_state->offset;      int state_offset = current_state->offset;
521      int count, codevalue;      int count, codevalue, rrc;
 #ifdef SUPPORT_UCP  
     int chartype, script;  
 #endif  
522    
523  #ifdef DEBUG  #ifdef DEBUG
524      printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);      printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
# Line 627  for (;;) Line 631  for (;;)
631            ADD_ACTIVE(state_offset - GET(code, 1), 0);            ADD_ACTIVE(state_offset - GET(code, 1), 0);
632            }            }
633          }          }
634        else if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0)        else
635          {          {
636          if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;          reached_end++;    /* Count branches that reach the end */
637            else if (match_count > 0 && ++match_count * 2 >= offsetcount)          if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0)
638              match_count = 0;            {
639          count = ((match_count == 0)? offsetcount : match_count * 2) - 2;            if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
640          if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));              else if (match_count > 0 && ++match_count * 2 >= offsetcount)
641          if (offsetcount >= 2)                match_count = 0;
642            {            count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
643            offsets[0] = current_subject - start_subject;            if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
644            offsets[1] = ptr - start_subject;            if (offsetcount >= 2)
645            DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,              {
646              offsets[1] - offsets[0], current_subject));              offsets[0] = current_subject - start_subject;
647            }              offsets[1] = ptr - start_subject;
648          if ((md->moptions & PCRE_DFA_SHORTEST) != 0)              DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
649            {                offsets[1] - offsets[0], current_subject));
650            DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"              }
651              "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,            if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
652              match_count, rlevel*2-2, SP));              {
653            return match_count;              DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
654            }                "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
655                  match_count, rlevel*2-2, SP));
656                return match_count;
657                }
658              }
659          }          }
660        break;        break;
661    
# Line 694  for (;;) Line 702  for (;;)
702        break;        break;
703    
704        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
705          case OP_SKIPZERO:
706          code += 1 + GET(code, 2);
707          while (*code == OP_ALT) code += GET(code, 1);
708          ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
709          break;
710    
711          /*-----------------------------------------------------------------*/
712        case OP_CIRC:        case OP_CIRC:
713        if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||        if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
714            ((ims & PCRE_MULTILINE) != 0 &&            ((ims & PCRE_MULTILINE) != 0 &&
# Line 732  for (;;) Line 747  for (;;)
747    
748        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
749        case OP_ANY:        case OP_ANY:
750        if (clen > 0 && ((ims & PCRE_DOTALL) != 0 || !IS_NEWLINE(ptr)))        if (clen > 0 && !IS_NEWLINE(ptr))
751            { ADD_NEW(state_offset + 1, 0); }
752          break;
753    
754          /*-----------------------------------------------------------------*/
755          case OP_ALLANY:
756          if (clen > 0)
757          { ADD_NEW(state_offset + 1, 0); }          { ADD_NEW(state_offset + 1, 0); }
758        break;        break;
759    
# Line 747  for (;;) Line 768  for (;;)
768        if ((md->moptions & PCRE_NOTEOL) == 0)        if ((md->moptions & PCRE_NOTEOL) == 0)
769          {          {
770          if (clen == 0 ||          if (clen == 0 ||
771              (IS_NEWLINE(ptr) &&              ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
772                 ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)                 ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
773              ))              ))
774            { ADD_ACTIVE(state_offset + 1, 0); }            { ADD_ACTIVE(state_offset + 1, 0); }
# Line 784  for (;;) Line 805  for (;;)
805          if (ptr > start_subject)          if (ptr > start_subject)
806            {            {
807            const uschar *temp = ptr - 1;            const uschar *temp = ptr - 1;
808              if (temp < md->start_used_ptr) md->start_used_ptr = temp;
809  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
810            if (utf8) BACKCHAR(temp);            if (utf8) BACKCHAR(temp);
811  #endif  #endif
# Line 792  for (;;) Line 814  for (;;)
814            }            }
815          else left_word = 0;          else left_word = 0;
816    
817          if (clen > 0) right_word = c < 256 && (ctypes[c] & ctype_word) != 0;          if (clen > 0)
818            else right_word = 0;            right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
819            else              /* This is a fudge to ensure that if this is the */
820              {               /* last item in the pattern, we don't count it as */
821              reached_end--;  /* reached, thus disabling a partial match. */
822              right_word = 0;
823              }
824    
825          if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))          if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
826            { ADD_ACTIVE(state_offset + 1, 0); }            { ADD_ACTIVE(state_offset + 1, 0); }
# Line 812  for (;;) Line 839  for (;;)
839        if (clen > 0)        if (clen > 0)
840          {          {
841          BOOL OK;          BOOL OK;
842          int category = _pcre_ucp_findprop(c, &chartype, &script);          const ucd_record * prop = GET_UCD(c);
843          switch(code[1])          switch(code[1])
844            {            {
845            case PT_ANY:            case PT_ANY:
# Line 820  for (;;) Line 847  for (;;)
847            break;            break;
848    
849            case PT_LAMP:            case PT_LAMP:
850            OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
851            break;            break;
852    
853            case PT_GC:            case PT_GC:
854            OK = category == code[2];            OK = _pcre_ucp_gentype[prop->chartype] == code[2];
855            break;            break;
856    
857            case PT_PC:            case PT_PC:
858            OK = chartype == code[2];            OK = prop->chartype == code[2];
859            break;            break;
860    
861            case PT_SC:            case PT_SC:
862            OK = script == code[2];            OK = prop->script == code[2];
863            break;            break;
864    
865            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
# Line 852  for (;;) Line 879  for (;;)
879  /* ========================================================================== */  /* ========================================================================== */
880        /* These opcodes likewise inspect the subject character, but have an        /* These opcodes likewise inspect the subject character, but have an
881        argument that is not a data character. It is one of these opcodes:        argument that is not a data character. It is one of these opcodes:
882        OP_ANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE, OP_WORDCHAR,        OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE,
883        OP_NOT_WORDCHAR. The value is loaded into d. */        OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
884    
885        case OP_TYPEPLUS:        case OP_TYPEPLUS:
886        case OP_TYPEMINPLUS:        case OP_TYPEMINPLUS:
# Line 864  for (;;) Line 891  for (;;)
891          {          {
892          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
893              (c < 256 &&              (c < 256 &&
894                (d != OP_ANY ||                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                (ims & PCRE_DOTALL) != 0 ||  
                !IS_NEWLINE(ptr)  
               ) &&  
895                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
896            {            {
897            if (count > 0 && codevalue == OP_TYPEPOSPLUS)            if (count > 0 && codevalue == OP_TYPEPOSPLUS)
# Line 890  for (;;) Line 914  for (;;)
914          {          {
915          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
916              (c < 256 &&              (c < 256 &&
917                (d != OP_ANY ||                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                (ims & PCRE_DOTALL) != 0 ||  
                !IS_NEWLINE(ptr)  
               ) &&  
918                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
919            {            {
920            if (codevalue == OP_TYPEPOSQUERY)            if (codevalue == OP_TYPEPOSQUERY)
# Line 915  for (;;) Line 936  for (;;)
936          {          {
937          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
938              (c < 256 &&              (c < 256 &&
939                (d != OP_ANY ||                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                (ims & PCRE_DOTALL) != 0 ||  
                !IS_NEWLINE(ptr)  
               ) &&  
940                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
941            {            {
942            if (codevalue == OP_TYPEPOSSTAR)            if (codevalue == OP_TYPEPOSSTAR)
# Line 938  for (;;) Line 956  for (;;)
956          {          {
957          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
958              (c < 256 &&              (c < 256 &&
959                (d != OP_ANY ||                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                (ims & PCRE_DOTALL) != 0 ||  
                !IS_NEWLINE(ptr)  
               ) &&  
960                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
961            {            {
962            if (++count >= GET2(code, 1))            if (++count >= GET2(code, 1))
# Line 962  for (;;) Line 977  for (;;)
977          {          {
978          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
979              (c < 256 &&              (c < 256 &&
980                (d != OP_ANY ||                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                (ims & PCRE_DOTALL) != 0 ||  
                !IS_NEWLINE(ptr)  
               ) &&  
981                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
982            {            {
983            if (codevalue == OP_TYPEPOSUPTO)            if (codevalue == OP_TYPEPOSUPTO)
# Line 996  for (;;) Line 1008  for (;;)
1008        if (clen > 0)        if (clen > 0)
1009          {          {
1010          BOOL OK;          BOOL OK;
1011          int category = _pcre_ucp_findprop(c, &chartype, &script);          const ucd_record * prop = GET_UCD(c);
1012          switch(code[2])          switch(code[2])
1013            {            {
1014            case PT_ANY:            case PT_ANY:
# Line 1004  for (;;) Line 1016  for (;;)
1016            break;            break;
1017    
1018            case PT_LAMP:            case PT_LAMP:
1019            OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1020            break;            break;
1021    
1022            case PT_GC:            case PT_GC:
1023            OK = category == code[3];            OK = _pcre_ucp_gentype[prop->chartype] == code[3];
1024            break;            break;
1025    
1026            case PT_PC:            case PT_PC:
1027            OK = chartype == code[3];            OK = prop->chartype == code[3];
1028            break;            break;
1029    
1030            case PT_SC:            case PT_SC:
1031            OK = script == code[3];            OK = prop->script == code[3];
1032            break;            break;
1033    
1034            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
# Line 1045  for (;;) Line 1057  for (;;)
1057        case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:        case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1058        count = current_state->count;  /* Already matched */        count = current_state->count;  /* Already matched */
1059        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1060        if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1061          {          {
1062          const uschar *nptr = ptr + clen;          const uschar *nptr = ptr + clen;
1063          int ncount = 0;          int ncount = 0;
# Line 1059  for (;;) Line 1071  for (;;)
1071            int nd;            int nd;
1072            int ndlen = 1;            int ndlen = 1;
1073            GETCHARLEN(nd, nptr, ndlen);            GETCHARLEN(nd, nptr, ndlen);
1074            if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;            if (UCD_CATEGORY(nd) != ucp_M) break;
1075            ncount++;            ncount++;
1076            nptr += ndlen;            nptr += ndlen;
1077            }            }
# Line 1218  for (;;) Line 1230  for (;;)
1230        if (clen > 0)        if (clen > 0)
1231          {          {
1232          BOOL OK;          BOOL OK;
1233          int category = _pcre_ucp_findprop(c, &chartype, &script);          const ucd_record * prop = GET_UCD(c);
1234          switch(code[2])          switch(code[2])
1235            {            {
1236            case PT_ANY:            case PT_ANY:
# Line 1226  for (;;) Line 1238  for (;;)
1238            break;            break;
1239    
1240            case PT_LAMP:            case PT_LAMP:
1241            OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1242            break;            break;
1243    
1244            case PT_GC:            case PT_GC:
1245            OK = category == code[3];            OK = _pcre_ucp_gentype[prop->chartype] == code[3];
1246            break;            break;
1247    
1248            case PT_PC:            case PT_PC:
1249            OK = chartype == code[3];            OK = prop->chartype == code[3];
1250            break;            break;
1251    
1252            case PT_SC:            case PT_SC:
1253            OK = script == code[3];            OK = prop->script == code[3];
1254            break;            break;
1255    
1256            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
# Line 1276  for (;;) Line 1288  for (;;)
1288        QS2:        QS2:
1289    
1290        ADD_ACTIVE(state_offset + 2, 0);        ADD_ACTIVE(state_offset + 2, 0);
1291        if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1292          {          {
1293          const uschar *nptr = ptr + clen;          const uschar *nptr = ptr + clen;
1294          int ncount = 0;          int ncount = 0;
# Line 1291  for (;;) Line 1303  for (;;)
1303            int nd;            int nd;
1304            int ndlen = 1;            int ndlen = 1;
1305            GETCHARLEN(nd, nptr, ndlen);            GETCHARLEN(nd, nptr, ndlen);
1306            if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;            if (UCD_CATEGORY(nd) != ucp_M) break;
1307            ncount++;            ncount++;
1308            nptr += ndlen;            nptr += ndlen;
1309            }            }
# Line 1465  for (;;) Line 1477  for (;;)
1477        if (clen > 0)        if (clen > 0)
1478          {          {
1479          BOOL OK;          BOOL OK;
1480          int category = _pcre_ucp_findprop(c, &chartype, &script);          const ucd_record * prop = GET_UCD(c);
1481          switch(code[4])          switch(code[4])
1482            {            {
1483            case PT_ANY:            case PT_ANY:
# Line 1473  for (;;) Line 1485  for (;;)
1485            break;            break;
1486    
1487            case PT_LAMP:            case PT_LAMP:
1488            OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1489            break;            break;
1490    
1491            case PT_GC:            case PT_GC:
1492            OK = category == code[5];            OK = _pcre_ucp_gentype[prop->chartype] == code[5];
1493            break;            break;
1494    
1495            case PT_PC:            case PT_PC:
1496            OK = chartype == code[5];            OK = prop->chartype == code[5];
1497            break;            break;
1498    
1499            case PT_SC:            case PT_SC:
1500            OK = script == code[5];            OK = prop->script == code[5];
1501            break;            break;
1502    
1503            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
# Line 1518  for (;;) Line 1530  for (;;)
1530        if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)        if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1531          { ADD_ACTIVE(state_offset + 4, 0); }          { ADD_ACTIVE(state_offset + 4, 0); }
1532        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
1533        if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1534          {          {
1535          const uschar *nptr = ptr + clen;          const uschar *nptr = ptr + clen;
1536          int ncount = 0;          int ncount = 0;
# Line 1532  for (;;) Line 1544  for (;;)
1544            int nd;            int nd;
1545            int ndlen = 1;            int ndlen = 1;
1546            GETCHARLEN(nd, nptr, ndlen);            GETCHARLEN(nd, nptr, ndlen);
1547            if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;            if (UCD_CATEGORY(nd) != ucp_M) break;
1548            ncount++;            ncount++;
1549            nptr += ndlen;            nptr += ndlen;
1550            }            }
# Line 1712  for (;;) Line 1724  for (;;)
1724            other case of the character. */            other case of the character. */
1725    
1726  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
1727            othercase = _pcre_ucp_othercase(c);            othercase = UCD_OTHERCASE(c);
1728  #else  #else
1729            othercase = NOTACHAR;            othercase = NOTACHAR;
1730  #endif  #endif
# Line 1737  for (;;) Line 1749  for (;;)
1749        to wait for them to pass before continuing. */        to wait for them to pass before continuing. */
1750    
1751        case OP_EXTUNI:        case OP_EXTUNI:
1752        if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1753          {          {
1754          const uschar *nptr = ptr + clen;          const uschar *nptr = ptr + clen;
1755          int ncount = 0;          int ncount = 0;
# Line 1745  for (;;) Line 1757  for (;;)
1757            {            {
1758            int nclen = 1;            int nclen = 1;
1759            GETCHARLEN(c, nptr, nclen);            GETCHARLEN(c, nptr, nclen);
1760            if (_pcre_ucp_findprop(c, &chartype, &script) != ucp_M) break;            if (UCD_CATEGORY(c) != ucp_M) break;
1761            ncount++;            ncount++;
1762            nptr += nclen;            nptr += nclen;
1763            }            }
# Line 1913  for (;;) Line 1925  for (;;)
1925            if (utf8 && d >= 128)            if (utf8 && d >= 128)
1926              {              {
1927  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
1928              otherd = _pcre_ucp_othercase(d);              otherd = UCD_OTHERCASE(d);
1929  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
1930              }              }
1931            else            else
# Line 1951  for (;;) Line 1963  for (;;)
1963            if (utf8 && d >= 128)            if (utf8 && d >= 128)
1964              {              {
1965  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
1966              otherd = _pcre_ucp_othercase(d);              otherd = UCD_OTHERCASE(d);
1967  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
1968              }              }
1969            else            else
# Line 1987  for (;;) Line 1999  for (;;)
1999            if (utf8 && d >= 128)            if (utf8 && d >= 128)
2000              {              {
2001  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2002              otherd = _pcre_ucp_othercase(d);              otherd = UCD_OTHERCASE(d);
2003  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2004              }              }
2005            else            else
# Line 2019  for (;;) Line 2031  for (;;)
2031            if (utf8 && d >= 128)            if (utf8 && d >= 128)
2032              {              {
2033  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2034              otherd = _pcre_ucp_othercase(d);              otherd = UCD_OTHERCASE(d);
2035  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2036              }              }
2037            else            else
# Line 2054  for (;;) Line 2066  for (;;)
2066            if (utf8 && d >= 128)            if (utf8 && d >= 128)
2067              {              {
2068  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2069              otherd = _pcre_ucp_othercase(d);              otherd = UCD_OTHERCASE(d);
2070  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2071              }              }
2072            else            else
# Line 2162  for (;;) Line 2174  for (;;)
2174    
2175  /* ========================================================================== */  /* ========================================================================== */
2176        /* These are the opcodes for fancy brackets of various kinds. We have        /* These are the opcodes for fancy brackets of various kinds. We have
2177        to use recursion in order to handle them. */        to use recursion in order to handle them. The "always failing" assertion
2178          (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2179          though the other "backtracking verbs" are not supported. */
2180    
2181          case OP_FAIL:
2182          forced_fail++;    /* Count FAILs for multiple states */
2183          break;
2184    
2185        case OP_ASSERT:        case OP_ASSERT:
2186        case OP_ASSERT_NOT:        case OP_ASSERT_NOT:
# Line 2200  for (;;) Line 2218  for (;;)
2218          {          {
2219          int local_offsets[1000];          int local_offsets[1000];
2220          int local_workspace[1000];          int local_workspace[1000];
2221          int condcode = code[LINK_SIZE+1];          int codelink = GET(code, 1);
2222            int condcode;
2223    
2224            /* Because of the way auto-callout works during compile, a callout item
2225            is inserted between OP_COND and an assertion condition. This does not
2226            happen for the other conditions. */
2227    
2228            if (code[LINK_SIZE+1] == OP_CALLOUT)
2229              {
2230              rrc = 0;
2231              if (pcre_callout != NULL)
2232                {
2233                pcre_callout_block cb;
2234                cb.version          = 1;   /* Version 1 of the callout block */
2235                cb.callout_number   = code[LINK_SIZE+2];
2236                cb.offset_vector    = offsets;
2237                cb.subject          = (PCRE_SPTR)start_subject;
2238                cb.subject_length   = end_subject - start_subject;
2239                cb.start_match      = current_subject - start_subject;
2240                cb.current_position = ptr - start_subject;
2241                cb.pattern_position = GET(code, LINK_SIZE + 3);
2242                cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
2243                cb.capture_top      = 1;
2244                cb.capture_last     = -1;
2245                cb.callout_data     = md->callout_data;
2246                if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc;   /* Abandon */
2247                }
2248              if (rrc > 0) break;                      /* Fail this thread */
2249              code += _pcre_OP_lengths[OP_CALLOUT];    /* Skip callout data */
2250              }
2251    
2252            condcode = code[LINK_SIZE+1];
2253    
2254          /* Back reference conditions are not supported */          /* Back reference conditions are not supported */
2255    
# Line 2209  for (;;) Line 2258  for (;;)
2258          /* The DEFINE condition is always false */          /* The DEFINE condition is always false */
2259    
2260          if (condcode == OP_DEF)          if (condcode == OP_DEF)
2261            {            { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
           ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0);  
           }  
2262    
2263          /* The only supported version of OP_RREF is for the value RREF_ANY,          /* The only supported version of OP_RREF is for the value RREF_ANY,
2264          which means "test if in any recursion". We can't test for specifically          which means "test if in any recursion". We can't test for specifically
# Line 2221  for (;;) Line 2268  for (;;)
2268            {            {
2269            int value = GET2(code, LINK_SIZE+2);            int value = GET2(code, LINK_SIZE+2);
2270            if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;            if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2271            if (recursing > 0) { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }            if (recursing > 0)
2272              else { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }              { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
2273              else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2274            }            }
2275    
2276          /* Otherwise, the condition is an assertion */          /* Otherwise, the condition is an assertion */
# Line 2252  for (;;) Line 2300  for (;;)
2300                  (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))                  (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2301              { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }              { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2302            else            else
2303              { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }              { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2304            }            }
2305          }          }
2306        break;        break;
# Line 2404  for (;;) Line 2452  for (;;)
2452        /* Handle callouts */        /* Handle callouts */
2453    
2454        case OP_CALLOUT:        case OP_CALLOUT:
2455          rrc = 0;
2456        if (pcre_callout != NULL)        if (pcre_callout != NULL)
2457          {          {
         int rrc;  
2458          pcre_callout_block cb;          pcre_callout_block cb;
2459          cb.version          = 1;   /* Version 1 of the callout block */          cb.version          = 1;   /* Version 1 of the callout block */
2460          cb.callout_number   = code[1];          cb.callout_number   = code[1];
# Line 2421  for (;;) Line 2469  for (;;)
2469          cb.capture_last     = -1;          cb.capture_last     = -1;
2470          cb.callout_data     = md->callout_data;          cb.callout_data     = md->callout_data;
2471          if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc;   /* Abandon */          if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc;   /* Abandon */
         if (rrc == 0) { ADD_ACTIVE(state_offset + 2 + 2*LINK_SIZE, 0); }  
2472          }          }
2473          if (rrc == 0)
2474            { ADD_ACTIVE(state_offset + _pcre_OP_lengths[OP_CALLOUT], 0); }
2475        break;        break;
2476    
2477    
# Line 2438  for (;;) Line 2487  for (;;)
2487    /* We have finished the processing at the current subject character. If no    /* We have finished the processing at the current subject character. If no
2488    new states have been set for the next character, we have found all the    new states have been set for the next character, we have found all the
2489    matches that we are going to find. If we are at the top level and partial    matches that we are going to find. If we are at the top level and partial
2490    matching has been requested, check for appropriate conditions. */    matching has been requested, check for appropriate conditions. The
2491      "forced_fail" variable counts the number of (*F) encountered for the character. If it
2492      is equal to the original active_count (saved in workspace[1]) it means that
2493      (*F) was found on every active state. In this case we don't want to give a
2494      partial match. */
2495    
2496    if (new_count <= 0)    if (new_count <= 0)
2497      {      {
2498      if (match_count < 0 &&                     /* No matches found */      if (rlevel == 1 &&                               /* Top level, and */
2499          rlevel == 1 &&                         /* Top level match function */          reached_end != workspace[1] &&               /* Not all reached end */
2500          (md->moptions & PCRE_PARTIAL) != 0 &&  /* Want partial matching */          forced_fail != workspace[1] &&               /* Not all forced fail & */
2501          ptr >= end_subject &&                  /* Reached end of subject */          (                                            /* either... */
2502          ptr > current_subject)                 /* Matched non-empty string */          (md->moptions & PCRE_PARTIAL_HARD) != 0      /* Hard partial */
2503            ||                                           /* or... */
2504            ((md->moptions & PCRE_PARTIAL_SOFT) != 0 &&  /* Soft partial and */
2505             match_count < 0)                            /* no matches */
2506            ) &&                                         /* And... */
2507            ptr >= end_subject &&                     /* Reached end of subject */
2508            ptr > current_subject)                    /* Matched non-empty string */
2509        {        {
2510        if (offsetcount >= 2)        if (offsetcount >= 2)
2511          {          {
2512          offsets[0] = current_subject - start_subject;          offsets[0] = md->start_used_ptr - start_subject;
2513          offsets[1] = end_subject - start_subject;          offsets[1] = end_subject - start_subject;
2514          }          }
2515        match_count = PCRE_ERROR_PARTIAL;        match_count = PCRE_ERROR_PARTIAL;
# Line 2505  Returns: > 0 => number of match Line 2564  Returns: > 0 => number of match
2564                   < -1 => some kind of unexpected problem                   < -1 => some kind of unexpected problem
2565  */  */
2566    
2567  PCRE_EXP_DEFN int  PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
2568  pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,  pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
2569    const char *subject, int length, int start_offset, int options, int *offsets,    const char *subject, int length, int start_offset, int options, int *offsets,
2570    int offsetcount, int *workspace, int wscount)    int offsetcount, int *workspace, int wscount)
# Line 2614  switch ((((options & PCRE_NEWLINE_BITS) Line 2673  switch ((((options & PCRE_NEWLINE_BITS)
2673           PCRE_NEWLINE_BITS)           PCRE_NEWLINE_BITS)
2674    {    {
2675    case 0: newline = NEWLINE; break;   /* Compile-time default */    case 0: newline = NEWLINE; break;   /* Compile-time default */
2676    case PCRE_NEWLINE_CR: newline = '\r'; break;    case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
2677    case PCRE_NEWLINE_LF: newline = '\n'; break;    case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
2678    case PCRE_NEWLINE_CR+    case PCRE_NEWLINE_CR+
2679         PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;         PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
2680    case PCRE_NEWLINE_ANY: newline = -1; break;    case PCRE_NEWLINE_ANY: newline = -1; break;
2681    case PCRE_NEWLINE_ANYCRLF: newline = -2; break;    case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
2682    default: return PCRE_ERROR_BADNEWLINE;    default: return PCRE_ERROR_BADNEWLINE;
# Line 2713  if ((re->flags & PCRE_REQCHSET) != 0) Line 2772  if ((re->flags & PCRE_REQCHSET) != 0)
2772    }    }
2773    
2774  /* Call the main matching function, looping for a non-anchored regex after a  /* Call the main matching function, looping for a non-anchored regex after a
2775  failed match. Unless restarting, optimize by moving to the first match  failed match. If not restarting, perform certain optimizations at the start of
2776  character if possible, when not anchored. Then unless wanting a partial match,  a match. */
 check for a required later character. */  
2777    
2778  for (;;)  for (;;)
2779    {    {
# Line 2725  for (;;) Line 2783  for (;;)
2783      {      {
2784      const uschar *save_end_subject = end_subject;      const uschar *save_end_subject = end_subject;
2785    
2786      /* Advance to a unique first char if possible. If firstline is TRUE, the      /* If firstline is TRUE, the start of the match is constrained to the first
2787      start of the match is constrained to the first line of a multiline string.      line of a multiline string. Implement this by temporarily adjusting
2788      Implement this by temporarily adjusting end_subject so that we stop      end_subject so that we stop scanning at a newline. If the match fails at
2789      scanning at a newline. If the match fails at the newline, later code breaks      the newline, later code breaks this loop. */
     this loop. */  
2790    
2791      if (firstline)      if (firstline)
2792        {        {
2793        const uschar *t = current_subject;        USPTR t = current_subject;
2794    #ifdef SUPPORT_UTF8
2795          if (utf8)
2796            {
2797            while (t < md->end_subject && !IS_NEWLINE(t))
2798              {
2799              t++;
2800              while (t < end_subject && (*t & 0xc0) == 0x80) t++;
2801              }
2802            }
2803          else
2804    #endif
2805        while (t < md->end_subject && !IS_NEWLINE(t)) t++;        while (t < md->end_subject && !IS_NEWLINE(t)) t++;
2806        end_subject = t;        end_subject = t;
2807        }        }
2808    
2809      if (first_byte >= 0)      /* There are some optimizations that avoid running the match if a known
2810        starting point is not found, or if a known later character is not present.
2811        However, there is an option that disables these, for testing and for
2812        ensuring that all callouts do actually occur. */
2813    
2814        if ((options & PCRE_NO_START_OPTIMIZE) == 0)
2815        {        {
       if (first_byte_caseless)  
         while (current_subject < end_subject &&  
                lcc[*current_subject] != first_byte)  
           current_subject++;  
       else  
         while (current_subject < end_subject && *current_subject != first_byte)  
           current_subject++;  
       }  
2816    
2817      /* Or to just after a linebreak for a multiline match if possible */        /* Advance to a known first byte. */
2818    
2819      else if (startline)        if (first_byte >= 0)
2820        {          {
2821        if (current_subject > md->start_subject + start_offset)          if (first_byte_caseless)
2822              while (current_subject < end_subject &&
2823                     lcc[*current_subject] != first_byte)
2824                current_subject++;
2825            else
2826              while (current_subject < end_subject &&
2827                     *current_subject != first_byte)
2828                current_subject++;
2829            }
2830    
2831          /* Or to just after a linebreak for a multiline match if possible */
2832    
2833          else if (startline)
2834          {          {
2835          while (current_subject <= end_subject && !WAS_NEWLINE(current_subject))          if (current_subject > md->start_subject + start_offset)
2836            current_subject++;            {
2837    #ifdef SUPPORT_UTF8
2838              if (utf8)
2839                {
2840                while (current_subject < end_subject &&
2841                       !WAS_NEWLINE(current_subject))
2842                  {
2843                  current_subject++;
2844                  while(current_subject < end_subject &&
2845                        (*current_subject & 0xc0) == 0x80)
2846                    current_subject++;
2847                  }
2848                }
2849              else
2850    #endif
2851              while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
2852                current_subject++;
2853    
2854              /* If we have just passed a CR and the newline option is ANY or
2855              ANYCRLF, and we are now at a LF, advance the match position by one
2856              more character. */
2857    
2858          /* If we have just passed a CR and the newline option is ANY or            if (current_subject[-1] == CHAR_CR &&
2859          ANYCRLF, and we are now at a LF, advance the match position by one more                 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
2860          character. */                 current_subject < end_subject &&
2861                   *current_subject == CHAR_NL)
2862          if (current_subject[-1] == '\r' &&              current_subject++;
2863               (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&            }
              current_subject < end_subject &&  
              *current_subject == '\n')  
           current_subject++;  
2864          }          }
       }  
2865    
2866      /* Or to a non-unique first char after study */        /* Or to a non-unique first char after study */
2867    
2868      else if (start_bits != NULL)        else if (start_bits != NULL)
       {  
       while (current_subject < end_subject)  
2869          {          {
2870          register unsigned int c = *current_subject;          while (current_subject < end_subject)
2871          if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;            {
2872            else break;            register unsigned int c = *current_subject;
2873              if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
2874                else break;
2875              }
2876          }          }
2877        }        }
2878    
# Line 2800  for (;;) Line 2894  for (;;)
2894    showed up when somebody was matching /^C/ on a 32-megabyte string... so we    showed up when somebody was matching /^C/ on a 32-megabyte string... so we
2895    don't do this when the string is sufficiently long.    don't do this when the string is sufficiently long.
2896    
2897    ALSO: this processing is disabled when partial matching is requested.    ALSO: this processing is disabled when partial matching is requested, and can
2898    */    also be explicitly deactivated. Furthermore, we have to disable it when
2899      restarting after a partial match, because the required character may have
2900      already been matched. */
2901    
2902    if (req_byte >= 0 &&    if ((options & PCRE_NO_START_OPTIMIZE) == 0 &&
2903          req_byte >= 0 &&
2904        end_subject - current_subject < REQ_BYTE_MAX &&        end_subject - current_subject < REQ_BYTE_MAX &&
2905        (options & PCRE_PARTIAL) == 0)        (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT|PCRE_DFA_RESTART)) == 0)
2906      {      {
2907      register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);      register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
2908    
# Line 2845  for (;;) Line 2942  for (;;)
2942    
2943    /* OK, now we can do the business */    /* OK, now we can do the business */
2944    
2945      md->start_used_ptr = current_subject;
2946    
2947    rc = internal_dfa_exec(    rc = internal_dfa_exec(
2948      md,                                /* fixed match data */      md,                                /* fixed match data */
2949      md->start_code,                    /* this subexpression's code */      md->start_code,                    /* this subexpression's code */
# Line 2879  for (;;) Line 2978  for (;;)
2978    not contain any explicit matches for \r or \n, and the newline option is CRLF    not contain any explicit matches for \r or \n, and the newline option is CRLF
2979    or ANY or ANYCRLF, advance the match position by one more character. */    or ANY or ANYCRLF, advance the match position by one more character. */
2980    
2981    if (current_subject[-1] == '\r' &&    if (current_subject[-1] == CHAR_CR &&
2982        current_subject < end_subject &&        current_subject < end_subject &&
2983        *current_subject == '\n' &&        *current_subject == CHAR_NL &&
2984        (re->flags & PCRE_HASCRORLF) == 0 &&        (re->flags & PCRE_HASCRORLF) == 0 &&
2985          (md->nltype == NLTYPE_ANY ||          (md->nltype == NLTYPE_ANY ||
2986           md->nltype == NLTYPE_ANYCRLF ||           md->nltype == NLTYPE_ANYCRLF ||

Legend:
Removed from v.327  
changed lines
  Added in v.435

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12