/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Diff of /code/trunk/pcre_dfa_exec.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 86 by nigel, Sat Feb 24 21:41:13 2007 UTC revision 87 by nigel, Sat Feb 24 21:41:21 2007 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2005 University of Cambridge             Copyright (c) 1997-2006 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 288  const uschar *start_subject = md->start_ Line 288  const uschar *start_subject = md->start_
288  const uschar *end_subject = md->end_subject;  const uschar *end_subject = md->end_subject;
289  const uschar *start_code = md->start_code;  const uschar *start_code = md->start_code;
290    
291    #ifdef SUPPORT_UTF8
292  BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;  BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
293    #endif
294    
295  rlevel++;  rlevel++;
296  offsetcount &= (-2);  offsetcount &= (-2);
# Line 480  for (;;) Line 482  for (;;)
482      const uschar *code;      const uschar *code;
483      int state_offset = current_state->offset;      int state_offset = current_state->offset;
484      int count, codevalue;      int count, codevalue;
485      int chartype, othercase;      int chartype, script;
486    
487  #ifdef DEBUG  #ifdef DEBUG
488      printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);      printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
# Line 757  for (;;) Line 759  for (;;)
759        case OP_NOTPROP:        case OP_NOTPROP:
760        if (clen > 0)        if (clen > 0)
761          {          {
762          int rqdtype, category;          BOOL OK;
763          category = _pcre_ucp_findchar(c, &chartype, &othercase);          int category = _pcre_ucp_findprop(c, &chartype, &script);
764          rqdtype = code[1];          switch(code[1])
         if (rqdtype >= 128)  
           {  
           if ((rqdtype - 128 == category) == (codevalue == OP_PROP))  
             { ADD_NEW(state_offset + 2, 0); }  
           }  
         else  
765            {            {
766            if ((rqdtype == chartype) == (codevalue == OP_PROP))            case PT_ANY:
767              { ADD_NEW(state_offset + 2, 0); }            OK = TRUE;
768              break;
769    
770              case PT_LAMP:
771              OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
772              break;
773    
774              case PT_GC:
775              OK = category == code[2];
776              break;
777    
778              case PT_PC:
779              OK = chartype == code[2];
780              break;
781    
782              case PT_SC:
783              OK = script == code[2];
784              break;
785    
786              /* Should never occur, but keep compilers from grumbling. */
787    
788              default:
789              OK = codevalue != OP_PROP;
790              break;
791            }            }
792    
793            if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
794          }          }
795        break;        break;
796  #endif  #endif
# Line 862  for (;;) Line 883  for (;;)
883        case OP_PROP_EXTRA + OP_TYPEPLUS:        case OP_PROP_EXTRA + OP_TYPEPLUS:
884        case OP_PROP_EXTRA + OP_TYPEMINPLUS:        case OP_PROP_EXTRA + OP_TYPEMINPLUS:
885        count = current_state->count;           /* Already matched */        count = current_state->count;           /* Already matched */
886        if (count > 0) { ADD_ACTIVE(state_offset + 3, 0); }        if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
887        if (clen > 0)        if (clen > 0)
888          {          {
889          int category = _pcre_ucp_findchar(c, &chartype, &othercase);          BOOL OK;
890          int rqdtype = code[2];          int category = _pcre_ucp_findprop(c, &chartype, &script);
891          if ((d == OP_PROP) ==          switch(code[2])
892              (rqdtype == ((rqdtype >= 128)? (category + 128) : chartype)))            {
893            { count++; ADD_NEW(state_offset, count); }            case PT_ANY:
894              OK = TRUE;
895              break;
896    
897              case PT_LAMP:
898              OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
899              break;
900    
901              case PT_GC:
902              OK = category == code[3];
903              break;
904    
905              case PT_PC:
906              OK = chartype == code[3];
907              break;
908    
909              case PT_SC:
910              OK = script == code[3];
911              break;
912    
913              /* Should never occur, but keep compilers from grumbling. */
914    
915              default:
916              OK = codevalue != OP_PROP;
917              break;
918              }
919    
920            if (OK == (d == OP_PROP)) { count++; ADD_NEW(state_offset, count); }
921          }          }
922        break;        break;
923    
# Line 878  for (;;) Line 926  for (;;)
926        case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:        case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
927        count = current_state->count;  /* Already matched */        count = current_state->count;  /* Already matched */
928        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
929        if (clen > 0 && _pcre_ucp_findchar(c, &chartype, &othercase) != ucp_M)        if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
930          {          {
931          const uschar *nptr = ptr + clen;          const uschar *nptr = ptr + clen;
932          int ncount = 0;          int ncount = 0;
# Line 887  for (;;) Line 935  for (;;)
935            int nd;            int nd;
936            int ndlen = 1;            int ndlen = 1;
937            GETCHARLEN(nd, nptr, ndlen);            GETCHARLEN(nd, nptr, ndlen);
938            if (_pcre_ucp_findchar(nd, &chartype, &othercase) != ucp_M) break;            if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
939            ncount++;            ncount++;
940            nptr += ndlen;            nptr += ndlen;
941            }            }
# Line 899  for (;;) Line 947  for (;;)
947        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
948        case OP_PROP_EXTRA + OP_TYPEQUERY:        case OP_PROP_EXTRA + OP_TYPEQUERY:
949        case OP_PROP_EXTRA + OP_TYPEMINQUERY:        case OP_PROP_EXTRA + OP_TYPEMINQUERY:
950        count = 3;        count = 4;
951        goto QS1;        goto QS1;
952    
953        case OP_PROP_EXTRA + OP_TYPESTAR:        case OP_PROP_EXTRA + OP_TYPESTAR:
# Line 908  for (;;) Line 956  for (;;)
956    
957        QS1:        QS1:
958    
959        ADD_ACTIVE(state_offset + 3, 0);        ADD_ACTIVE(state_offset + 4, 0);
960        if (clen > 0)        if (clen > 0)
961          {          {
962          int category = _pcre_ucp_findchar(c, &chartype, &othercase);          BOOL OK;
963          int rqdtype = code[2];          int category = _pcre_ucp_findprop(c, &chartype, &script);
964          if ((d == OP_PROP) ==          switch(code[2])
965              (rqdtype == ((rqdtype >= 128)? (category + 128) : chartype)))            {
966            { ADD_NEW(state_offset + count, 0); }            case PT_ANY:
967              OK = TRUE;
968              break;
969    
970              case PT_LAMP:
971              OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
972              break;
973    
974              case PT_GC:
975              OK = category == code[3];
976              break;
977    
978              case PT_PC:
979              OK = chartype == code[3];
980              break;
981    
982              case PT_SC:
983              OK = script == code[3];
984              break;
985    
986              /* Should never occur, but keep compilers from grumbling. */
987    
988              default:
989              OK = codevalue != OP_PROP;
990              break;
991              }
992    
993            if (OK == (d == OP_PROP)) { ADD_NEW(state_offset + count, 0); }
994          }          }
995        break;        break;
996    
# Line 932  for (;;) Line 1007  for (;;)
1007        QS2:        QS2:
1008    
1009        ADD_ACTIVE(state_offset + 2, 0);        ADD_ACTIVE(state_offset + 2, 0);
1010        if (clen > 0 && _pcre_ucp_findchar(c, &chartype, &othercase) != ucp_M)        if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1011          {          {
1012          const uschar *nptr = ptr + clen;          const uschar *nptr = ptr + clen;
1013          int ncount = 0;          int ncount = 0;
# Line 941  for (;;) Line 1016  for (;;)
1016            int nd;            int nd;
1017            int ndlen = 1;            int ndlen = 1;
1018            GETCHARLEN(nd, nptr, ndlen);            GETCHARLEN(nd, nptr, ndlen);
1019            if (_pcre_ucp_findchar(nd, &chartype, &othercase) != ucp_M) break;            if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
1020            ncount++;            ncount++;
1021            nptr += ndlen;            nptr += ndlen;
1022            }            }
# Line 954  for (;;) Line 1029  for (;;)
1029        case OP_PROP_EXTRA + OP_TYPEUPTO:        case OP_PROP_EXTRA + OP_TYPEUPTO:
1030        case OP_PROP_EXTRA + OP_TYPEMINUPTO:        case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1031        if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)        if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1032          { ADD_ACTIVE(state_offset + 5, 0); }          { ADD_ACTIVE(state_offset + 6, 0); }
1033        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
1034        if (clen > 0)        if (clen > 0)
1035          {          {
1036          int category = _pcre_ucp_findchar(c, &chartype, &othercase);          BOOL OK;
1037          int rqdtype = code[4];          int category = _pcre_ucp_findprop(c, &chartype, &script);
1038          if ((d == OP_PROP) ==          switch(code[4])
1039              (rqdtype == ((rqdtype >= 128)? (category + 128) : chartype)))            {
1040              case PT_ANY:
1041              OK = TRUE;
1042              break;
1043    
1044              case PT_LAMP:
1045              OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1046              break;
1047    
1048              case PT_GC:
1049              OK = category == code[5];
1050              break;
1051    
1052              case PT_PC:
1053              OK = chartype == code[5];
1054              break;
1055    
1056              case PT_SC:
1057              OK = script == code[5];
1058              break;
1059    
1060              /* Should never occur, but keep compilers from grumbling. */
1061    
1062              default:
1063              OK = codevalue != OP_PROP;
1064              break;
1065              }
1066    
1067            if (OK == (d == OP_PROP))
1068            {            {
1069            if (++count >= GET2(code, 1))            if (++count >= GET2(code, 1))
1070              { ADD_NEW(state_offset + 5, 0); }              { ADD_NEW(state_offset + 6, 0); }
1071            else            else
1072              { ADD_NEW(state_offset, count); }              { ADD_NEW(state_offset, count); }
1073            }            }
# Line 978  for (;;) Line 1081  for (;;)
1081        if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)        if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1082          { ADD_ACTIVE(state_offset + 4, 0); }          { ADD_ACTIVE(state_offset + 4, 0); }
1083        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
1084        if (clen > 0 && _pcre_ucp_findchar(c, &chartype, &othercase) != ucp_M)        if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1085          {          {
1086          const uschar *nptr = ptr + clen;          const uschar *nptr = ptr + clen;
1087          int ncount = 0;          int ncount = 0;
# Line 987  for (;;) Line 1090  for (;;)
1090            int nd;            int nd;
1091            int ndlen = 1;            int ndlen = 1;
1092            GETCHARLEN(nd, nptr, ndlen);            GETCHARLEN(nd, nptr, ndlen);
1093            if (_pcre_ucp_findchar(nd, &chartype, &othercase) != ucp_M) break;            if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
1094            ncount++;            ncount++;
1095            nptr += ndlen;            nptr += ndlen;
1096            }            }
# Line 1018  for (;;) Line 1121  for (;;)
1121          {          {
1122          if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else          if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
1123            {            {
1124              int othercase;
1125            if (c < 128) othercase = fcc[c]; else            if (c < 128) othercase = fcc[c]; else
1126    
1127            /* If we have Unicode property support, we can use it to test the            /* If we have Unicode property support, we can use it to test the
1128            other case of the character, if there is one. The result of            other case of the character. */
           _pcre_ucp_findchar() is < 0 if the char isn't found, and othercase is  
           returned as zero if there isn't another case. */  
1129    
1130  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
1131            if (_pcre_ucp_findchar(c, &chartype, &othercase) < 0)            othercase = _pcre_ucp_othercase(c);
1132    #else
1133              othercase = -1;
1134  #endif  #endif
             othercase = -1;  
1135    
1136            if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }            if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
1137            }            }
# Line 1050  for (;;) Line 1153  for (;;)
1153        to wait for them to pass before continuing. */        to wait for them to pass before continuing. */
1154    
1155        case OP_EXTUNI:        case OP_EXTUNI:
1156        if (clen > 0 && _pcre_ucp_findchar(c, &chartype, &othercase) != ucp_M)        if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1157          {          {
1158          const uschar *nptr = ptr + clen;          const uschar *nptr = ptr + clen;
1159          int ncount = 0;          int ncount = 0;
# Line 1058  for (;;) Line 1161  for (;;)
1161            {            {
1162            int nclen = 1;            int nclen = 1;
1163            GETCHARLEN(c, nptr, nclen);            GETCHARLEN(c, nptr, nclen);
1164            if (_pcre_ucp_findchar(c, &chartype, &othercase) != ucp_M) break;            if (_pcre_ucp_findprop(c, &chartype, &script) != ucp_M) break;
1165            ncount++;            ncount++;
1166            nptr += nclen;            nptr += nclen;
1167            }            }
# Line 1093  for (;;) Line 1196  for (;;)
1196          if ((ims & PCRE_CASELESS) != 0)          if ((ims & PCRE_CASELESS) != 0)
1197            {            {
1198  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1199            if (utf8 && c >= 128)            if (utf8 && d >= 128)
1200              {              {
1201  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
1202              if (_pcre_ucp_findchar(d, &chartype, &otherd) < 0) otherd = -1;              otherd = _pcre_ucp_othercase(d);
1203  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
1204              }              }
1205            else            else
# Line 1120  for (;;) Line 1223  for (;;)
1223          if ((ims && PCRE_CASELESS) != 0)          if ((ims && PCRE_CASELESS) != 0)
1224            {            {
1225  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1226            if (utf8 && c >= 128)            if (utf8 && d >= 128)
1227              {              {
1228  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
1229              if (_pcre_ucp_findchar(c, &chartype, &otherd) < 0) otherd = -1;              otherd = _pcre_ucp_othercase(d);
1230  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
1231              }              }
1232            else            else
# Line 1147  for (;;) Line 1250  for (;;)
1250          if ((ims && PCRE_CASELESS) != 0)          if ((ims && PCRE_CASELESS) != 0)
1251            {            {
1252  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1253            if (utf8 && c >= 128)            if (utf8 && d >= 128)
1254              {              {
1255  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
1256              if (_pcre_ucp_findchar(c, &chartype, &otherd) < 0) otherd = -1;              otherd = _pcre_ucp_othercase(d);
1257  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
1258              }              }
1259            else            else
# Line 1178  for (;;) Line 1281  for (;;)
1281          if ((ims & PCRE_CASELESS) != 0)          if ((ims & PCRE_CASELESS) != 0)
1282            {            {
1283  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1284            if (utf8 && c >= 128)            if (utf8 && d >= 128)
1285              {              {
1286  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
1287              if (_pcre_ucp_findchar(d, &chartype, &otherd) < 0) otherd = -1;              otherd = _pcre_ucp_othercase(d);
1288  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
1289              }              }
1290            else            else
# Line 1519  for (;;) Line 1622  for (;;)
1622          cb.version          = 1;   /* Version 1 of the callout block */          cb.version          = 1;   /* Version 1 of the callout block */
1623          cb.callout_number   = code[1];          cb.callout_number   = code[1];
1624          cb.offset_vector    = offsets;          cb.offset_vector    = offsets;
1625          cb.subject          = (char *)start_subject;          cb.subject          = (PCRE_SPTR)start_subject;
1626          cb.subject_length   = end_subject - start_subject;          cb.subject_length   = end_subject - start_subject;
1627          cb.start_match      = current_subject - start_subject;          cb.start_match      = current_subject - start_subject;
1628          cb.current_position = ptr - start_subject;          cb.current_position = ptr - start_subject;
# Line 1611  Returns: > 0 => number of match Line 1714  Returns: > 0 => number of match
1714                   < -1 => some kind of unexpected problem                   < -1 => some kind of unexpected problem
1715  */  */
1716    
1717  PCRE_EXPORT int  PCRE_DATA_SCOPE int
1718  pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,  pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
1719    const char *subject, int length, int start_offset, int options, int *offsets,    const char *subject, int length, int start_offset, int options, int *offsets,
1720    int offsetcount, int *workspace, int wscount)    int offsetcount, int *workspace, int wscount)
# Line 1655  if (extra_data != NULL) Line 1758  if (extra_data != NULL)
1758    if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)    if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
1759      study = (const pcre_study_data *)extra_data->study_data;      study = (const pcre_study_data *)extra_data->study_data;
1760    if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;    if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
1761      if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
1762        return PCRE_ERROR_DFA_UMLIMIT;
1763    if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)    if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
1764      match_block.callout_data = extra_data->callout_data;      match_block.callout_data = extra_data->callout_data;
1765    if ((flags & PCRE_EXTRA_TABLES) != 0)    if ((flags & PCRE_EXTRA_TABLES) != 0)
# Line 1680  end_subject = (const unsigned char *)sub Line 1785  end_subject = (const unsigned char *)sub
1785  req_byte_ptr = current_subject - 1;  req_byte_ptr = current_subject - 1;
1786    
1787  utf8 = (re->options & PCRE_UTF8) != 0;  utf8 = (re->options & PCRE_UTF8) != 0;
1788  anchored = (options & PCRE_ANCHORED) != 0 || (re->options & PCRE_ANCHORED) != 0;  
1789    anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
1790      (re->options & PCRE_ANCHORED) != 0;
1791    
1792  /* The remaining fixed data for passing around. */  /* The remaining fixed data for passing around. */
1793    
# Line 1771  for (;;) Line 1878  for (;;)
1878    
1879      /* Advance to a unique first char if possible. If firstline is TRUE, the      /* Advance to a unique first char if possible. If firstline is TRUE, the
1880      start of the match is constrained to the first line of a multiline string.      start of the match is constrained to the first line of a multiline string.
1881      Implement this by temporarily adjusting end_subject so that we stop scanning      Implement this by temporarily adjusting end_subject so that we stop
1882      at a newline. If the match fails at the newline, later code breaks this loop.      scanning at a newline. If the match fails at the newline, later code breaks
1883      */      this loop. */
1884    
1885      if (firstline)      if (firstline)
1886        {        {

Legend:
Removed from v.86  
changed lines
  Added in v.87

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12