| 6 |
and semantics are as close as possible to those of the Perl 5 language. |
and semantics are as close as possible to those of the Perl 5 language. |
| 7 |
|
|
| 8 |
Written by Philip Hazel |
Written by Philip Hazel |
| 9 |
Copyright (c) 1997-2005 University of Cambridge |
Copyright (c) 1997-2006 University of Cambridge |
| 10 |
|
|
| 11 |
----------------------------------------------------------------------------- |
----------------------------------------------------------------------------- |
| 12 |
Redistribution and use in source and binary forms, with or without |
Redistribution and use in source and binary forms, with or without |
| 43 |
compatible, but it has advantages in certain applications. */ |
compatible, but it has advantages in certain applications. */ |
| 44 |
|
|
| 45 |
|
|
| 46 |
|
#define NLBLOCK md /* The block containing newline information */ |
| 47 |
#include "pcre_internal.h" |
#include "pcre_internal.h" |
| 48 |
|
|
| 49 |
|
|
| 289 |
const uschar *end_subject = md->end_subject; |
const uschar *end_subject = md->end_subject; |
| 290 |
const uschar *start_code = md->start_code; |
const uschar *start_code = md->start_code; |
| 291 |
|
|
| 292 |
|
#ifdef SUPPORT_UTF8 |
| 293 |
BOOL utf8 = (md->poptions & PCRE_UTF8) != 0; |
BOOL utf8 = (md->poptions & PCRE_UTF8) != 0; |
| 294 |
|
#endif |
| 295 |
|
|
| 296 |
rlevel++; |
rlevel++; |
| 297 |
offsetcount &= (-2); |
offsetcount &= (-2); |
| 424 |
for (;;) |
for (;;) |
| 425 |
{ |
{ |
| 426 |
int i, j; |
int i, j; |
| 427 |
int c, d, clen, dlen; |
int clen, dlen; |
| 428 |
|
unsigned int c, d; |
| 429 |
|
|
| 430 |
/* Make the new state list into the active state list and empty the |
/* Make the new state list into the active state list and empty the |
| 431 |
new state list. */ |
new state list. */ |
| 484 |
const uschar *code; |
const uschar *code; |
| 485 |
int state_offset = current_state->offset; |
int state_offset = current_state->offset; |
| 486 |
int count, codevalue; |
int count, codevalue; |
| 487 |
int chartype, othercase; |
int chartype, script; |
| 488 |
|
|
| 489 |
#ifdef DEBUG |
#ifdef DEBUG |
| 490 |
printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset); |
printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset); |
| 649 |
/*-----------------------------------------------------------------*/ |
/*-----------------------------------------------------------------*/ |
| 650 |
case OP_CIRC: |
case OP_CIRC: |
| 651 |
if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) || |
if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) || |
| 652 |
((ims & PCRE_MULTILINE) != 0 && ptr[-1] == NEWLINE)) |
((ims & PCRE_MULTILINE) != 0 && |
| 653 |
|
ptr >= start_subject + md->nllen && |
| 654 |
|
ptr != end_subject && |
| 655 |
|
IS_NEWLINE(ptr - md->nllen))) |
| 656 |
{ ADD_ACTIVE(state_offset + 1, 0); } |
{ ADD_ACTIVE(state_offset + 1, 0); } |
| 657 |
break; |
break; |
| 658 |
|
|
| 686 |
|
|
| 687 |
/*-----------------------------------------------------------------*/ |
/*-----------------------------------------------------------------*/ |
| 688 |
case OP_ANY: |
case OP_ANY: |
| 689 |
if (clen > 0 && (c != NEWLINE || (ims & PCRE_DOTALL) != 0)) |
if (clen > 0 && ((ims & PCRE_DOTALL) != 0 || |
| 690 |
|
ptr > end_subject - md->nllen || |
| 691 |
|
!IS_NEWLINE(ptr))) |
| 692 |
{ ADD_NEW(state_offset + 1, 0); } |
{ ADD_NEW(state_offset + 1, 0); } |
| 693 |
break; |
break; |
| 694 |
|
|
| 695 |
/*-----------------------------------------------------------------*/ |
/*-----------------------------------------------------------------*/ |
| 696 |
case OP_EODN: |
case OP_EODN: |
| 697 |
if (clen == 0 || (c == NEWLINE && ptr + 1 == end_subject)) |
if (clen == 0 || |
| 698 |
|
(ptr == end_subject - md->nllen && IS_NEWLINE(ptr))) |
| 699 |
{ ADD_ACTIVE(state_offset + 1, 0); } |
{ ADD_ACTIVE(state_offset + 1, 0); } |
| 700 |
break; |
break; |
| 701 |
|
|
| 703 |
case OP_DOLL: |
case OP_DOLL: |
| 704 |
if ((md->moptions & PCRE_NOTEOL) == 0) |
if ((md->moptions & PCRE_NOTEOL) == 0) |
| 705 |
{ |
{ |
| 706 |
if (clen == 0 || (c == NEWLINE && (ptr + 1 == end_subject || |
if (clen == 0 || |
| 707 |
(ims & PCRE_MULTILINE) != 0))) |
(ptr <= end_subject - md->nllen && IS_NEWLINE(ptr) && |
| 708 |
|
((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen) |
| 709 |
|
)) |
| 710 |
{ ADD_ACTIVE(state_offset + 1, 0); } |
{ ADD_ACTIVE(state_offset + 1, 0); } |
| 711 |
} |
} |
| 712 |
else if (c == NEWLINE && (ims & PCRE_MULTILINE) != 0) |
else if ((ims & PCRE_MULTILINE) != 0 && |
| 713 |
|
ptr <= end_subject - md->nllen && IS_NEWLINE(ptr)) |
| 714 |
{ ADD_ACTIVE(state_offset + 1, 0); } |
{ ADD_ACTIVE(state_offset + 1, 0); } |
| 715 |
break; |
break; |
| 716 |
|
|
| 770 |
case OP_NOTPROP: |
case OP_NOTPROP: |
| 771 |
if (clen > 0) |
if (clen > 0) |
| 772 |
{ |
{ |
| 773 |
int rqdtype, category; |
BOOL OK; |
| 774 |
category = ucp_findchar(c, &chartype, &othercase); |
int category = _pcre_ucp_findprop(c, &chartype, &script); |
| 775 |
rqdtype = code[1]; |
switch(code[1]) |
|
if (rqdtype >= 128) |
|
|
{ |
|
|
if ((rqdtype - 128 == category) == (codevalue == OP_PROP)) |
|
|
{ ADD_NEW(state_offset + 2, 0); } |
|
|
} |
|
|
else |
|
| 776 |
{ |
{ |
| 777 |
if ((rqdtype == chartype) == (codevalue == OP_PROP)) |
case PT_ANY: |
| 778 |
{ ADD_NEW(state_offset + 2, 0); } |
OK = TRUE; |
| 779 |
|
break; |
| 780 |
|
|
| 781 |
|
case PT_LAMP: |
| 782 |
|
OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt; |
| 783 |
|
break; |
| 784 |
|
|
| 785 |
|
case PT_GC: |
| 786 |
|
OK = category == code[2]; |
| 787 |
|
break; |
| 788 |
|
|
| 789 |
|
case PT_PC: |
| 790 |
|
OK = chartype == code[2]; |
| 791 |
|
break; |
| 792 |
|
|
| 793 |
|
case PT_SC: |
| 794 |
|
OK = script == code[2]; |
| 795 |
|
break; |
| 796 |
|
|
| 797 |
|
/* Should never occur, but keep compilers from grumbling. */ |
| 798 |
|
|
| 799 |
|
default: |
| 800 |
|
OK = codevalue != OP_PROP; |
| 801 |
|
break; |
| 802 |
} |
} |
| 803 |
|
|
| 804 |
|
if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); } |
| 805 |
} |
} |
| 806 |
break; |
break; |
| 807 |
#endif |
#endif |
| 822 |
{ |
{ |
| 823 |
if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || |
if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || |
| 824 |
(c < 256 && |
(c < 256 && |
| 825 |
(d != OP_ANY || c != '\n' || (ims & PCRE_DOTALL) != 0) && |
(d != OP_ANY || |
| 826 |
|
(ims & PCRE_DOTALL) != 0 || |
| 827 |
|
ptr > end_subject - md->nllen || |
| 828 |
|
!IS_NEWLINE(ptr) |
| 829 |
|
) && |
| 830 |
((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) |
((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) |
| 831 |
{ |
{ |
| 832 |
count++; |
count++; |
| 843 |
{ |
{ |
| 844 |
if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || |
if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || |
| 845 |
(c < 256 && |
(c < 256 && |
| 846 |
(d != OP_ANY || c != '\n' || (ims & PCRE_DOTALL) != 0) && |
(d != OP_ANY || |
| 847 |
|
(ims & PCRE_DOTALL) != 0 || |
| 848 |
|
ptr > end_subject - md->nllen || |
| 849 |
|
!IS_NEWLINE(ptr) |
| 850 |
|
) && |
| 851 |
((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) |
((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) |
| 852 |
{ |
{ |
| 853 |
ADD_NEW(state_offset + 2, 0); |
ADD_NEW(state_offset + 2, 0); |
| 863 |
{ |
{ |
| 864 |
if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || |
if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || |
| 865 |
(c < 256 && |
(c < 256 && |
| 866 |
(d != OP_ANY || c != '\n' || (ims & PCRE_DOTALL) != 0) && |
(d != OP_ANY || |
| 867 |
|
(ims & PCRE_DOTALL) != 0 || |
| 868 |
|
ptr > end_subject - md->nllen || |
| 869 |
|
!IS_NEWLINE(ptr) |
| 870 |
|
) && |
| 871 |
((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) |
((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) |
| 872 |
{ |
{ |
| 873 |
ADD_NEW(state_offset, 0); |
ADD_NEW(state_offset, 0); |
| 886 |
{ |
{ |
| 887 |
if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || |
if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || |
| 888 |
(c < 256 && |
(c < 256 && |
| 889 |
(d != OP_ANY || c != '\n' || (ims & PCRE_DOTALL) != 0) && |
(d != OP_ANY || |
| 890 |
|
(ims & PCRE_DOTALL) != 0 || |
| 891 |
|
ptr > end_subject - md->nllen || |
| 892 |
|
!IS_NEWLINE(ptr) |
| 893 |
|
) && |
| 894 |
((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) |
((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) |
| 895 |
{ |
{ |
| 896 |
if (++count >= GET2(code, 1)) |
if (++count >= GET2(code, 1)) |
| 910 |
case OP_PROP_EXTRA + OP_TYPEPLUS: |
case OP_PROP_EXTRA + OP_TYPEPLUS: |
| 911 |
case OP_PROP_EXTRA + OP_TYPEMINPLUS: |
case OP_PROP_EXTRA + OP_TYPEMINPLUS: |
| 912 |
count = current_state->count; /* Already matched */ |
count = current_state->count; /* Already matched */ |
| 913 |
if (count > 0) { ADD_ACTIVE(state_offset + 3, 0); } |
if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); } |
| 914 |
if (clen > 0) |
if (clen > 0) |
| 915 |
{ |
{ |
| 916 |
int category = ucp_findchar(c, &chartype, &othercase); |
BOOL OK; |
| 917 |
int rqdtype = code[2]; |
int category = _pcre_ucp_findprop(c, &chartype, &script); |
| 918 |
if ((d == OP_PROP) == |
switch(code[2]) |
| 919 |
(rqdtype == ((rqdtype >= 128)? (category + 128) : chartype))) |
{ |
| 920 |
{ count++; ADD_NEW(state_offset, count); } |
case PT_ANY: |
| 921 |
|
OK = TRUE; |
| 922 |
|
break; |
| 923 |
|
|
| 924 |
|
case PT_LAMP: |
| 925 |
|
OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt; |
| 926 |
|
break; |
| 927 |
|
|
| 928 |
|
case PT_GC: |
| 929 |
|
OK = category == code[3]; |
| 930 |
|
break; |
| 931 |
|
|
| 932 |
|
case PT_PC: |
| 933 |
|
OK = chartype == code[3]; |
| 934 |
|
break; |
| 935 |
|
|
| 936 |
|
case PT_SC: |
| 937 |
|
OK = script == code[3]; |
| 938 |
|
break; |
| 939 |
|
|
| 940 |
|
/* Should never occur, but keep compilers from grumbling. */ |
| 941 |
|
|
| 942 |
|
default: |
| 943 |
|
OK = codevalue != OP_PROP; |
| 944 |
|
break; |
| 945 |
|
} |
| 946 |
|
|
| 947 |
|
if (OK == (d == OP_PROP)) { count++; ADD_NEW(state_offset, count); } |
| 948 |
} |
} |
| 949 |
break; |
break; |
| 950 |
|
|
| 953 |
case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS: |
case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS: |
| 954 |
count = current_state->count; /* Already matched */ |
count = current_state->count; /* Already matched */ |
| 955 |
if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); } |
if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); } |
| 956 |
if (clen > 0 && ucp_findchar(c, &chartype, &othercase) != ucp_M) |
if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M) |
| 957 |
{ |
{ |
| 958 |
const uschar *nptr = ptr + clen; |
const uschar *nptr = ptr + clen; |
| 959 |
int ncount = 0; |
int ncount = 0; |
| 962 |
int nd; |
int nd; |
| 963 |
int ndlen = 1; |
int ndlen = 1; |
| 964 |
GETCHARLEN(nd, nptr, ndlen); |
GETCHARLEN(nd, nptr, ndlen); |
| 965 |
if (ucp_findchar(nd, &chartype, &othercase) != ucp_M) break; |
if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break; |
| 966 |
ncount++; |
ncount++; |
| 967 |
nptr += ndlen; |
nptr += ndlen; |
| 968 |
} |
} |
| 974 |
/*-----------------------------------------------------------------*/ |
/*-----------------------------------------------------------------*/ |
| 975 |
case OP_PROP_EXTRA + OP_TYPEQUERY: |
case OP_PROP_EXTRA + OP_TYPEQUERY: |
| 976 |
case OP_PROP_EXTRA + OP_TYPEMINQUERY: |
case OP_PROP_EXTRA + OP_TYPEMINQUERY: |
| 977 |
count = 3; |
count = 4; |
| 978 |
goto QS1; |
goto QS1; |
| 979 |
|
|
| 980 |
case OP_PROP_EXTRA + OP_TYPESTAR: |
case OP_PROP_EXTRA + OP_TYPESTAR: |
| 983 |
|
|
| 984 |
QS1: |
QS1: |
| 985 |
|
|
| 986 |
ADD_ACTIVE(state_offset + 3, 0); |
ADD_ACTIVE(state_offset + 4, 0); |
| 987 |
if (clen > 0) |
if (clen > 0) |
| 988 |
{ |
{ |
| 989 |
int category = ucp_findchar(c, &chartype, &othercase); |
BOOL OK; |
| 990 |
int rqdtype = code[2]; |
int category = _pcre_ucp_findprop(c, &chartype, &script); |
| 991 |
if ((d == OP_PROP) == |
switch(code[2]) |
| 992 |
(rqdtype == ((rqdtype >= 128)? (category + 128) : chartype))) |
{ |
| 993 |
{ ADD_NEW(state_offset + count, 0); } |
case PT_ANY: |
| 994 |
|
OK = TRUE; |
| 995 |
|
break; |
| 996 |
|
|
| 997 |
|
case PT_LAMP: |
| 998 |
|
OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt; |
| 999 |
|
break; |
| 1000 |
|
|
| 1001 |
|
case PT_GC: |
| 1002 |
|
OK = category == code[3]; |
| 1003 |
|
break; |
| 1004 |
|
|
| 1005 |
|
case PT_PC: |
| 1006 |
|
OK = chartype == code[3]; |
| 1007 |
|
break; |
| 1008 |
|
|
| 1009 |
|
case PT_SC: |
| 1010 |
|
OK = script == code[3]; |
| 1011 |
|
break; |
| 1012 |
|
|
| 1013 |
|
/* Should never occur, but keep compilers from grumbling. */ |
| 1014 |
|
|
| 1015 |
|
default: |
| 1016 |
|
OK = codevalue != OP_PROP; |
| 1017 |
|
break; |
| 1018 |
|
} |
| 1019 |
|
|
| 1020 |
|
if (OK == (d == OP_PROP)) { ADD_NEW(state_offset + count, 0); } |
| 1021 |
} |
} |
| 1022 |
break; |
break; |
| 1023 |
|
|
| 1034 |
QS2: |
QS2: |
| 1035 |
|
|
| 1036 |
ADD_ACTIVE(state_offset + 2, 0); |
ADD_ACTIVE(state_offset + 2, 0); |
| 1037 |
if (clen > 0 && ucp_findchar(c, &chartype, &othercase) != ucp_M) |
if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M) |
| 1038 |
{ |
{ |
| 1039 |
const uschar *nptr = ptr + clen; |
const uschar *nptr = ptr + clen; |
| 1040 |
int ncount = 0; |
int ncount = 0; |
| 1043 |
int nd; |
int nd; |
| 1044 |
int ndlen = 1; |
int ndlen = 1; |
| 1045 |
GETCHARLEN(nd, nptr, ndlen); |
GETCHARLEN(nd, nptr, ndlen); |
| 1046 |
if (ucp_findchar(nd, &chartype, &othercase) != ucp_M) break; |
if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break; |
| 1047 |
ncount++; |
ncount++; |
| 1048 |
nptr += ndlen; |
nptr += ndlen; |
| 1049 |
} |
} |
| 1056 |
case OP_PROP_EXTRA + OP_TYPEUPTO: |
case OP_PROP_EXTRA + OP_TYPEUPTO: |
| 1057 |
case OP_PROP_EXTRA + OP_TYPEMINUPTO: |
case OP_PROP_EXTRA + OP_TYPEMINUPTO: |
| 1058 |
if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT) |
if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT) |
| 1059 |
{ ADD_ACTIVE(state_offset + 5, 0); } |
{ ADD_ACTIVE(state_offset + 6, 0); } |
| 1060 |
count = current_state->count; /* Number already matched */ |
count = current_state->count; /* Number already matched */ |
| 1061 |
if (clen > 0) |
if (clen > 0) |
| 1062 |
{ |
{ |
| 1063 |
int category = ucp_findchar(c, &chartype, &othercase); |
BOOL OK; |
| 1064 |
int rqdtype = code[4]; |
int category = _pcre_ucp_findprop(c, &chartype, &script); |
| 1065 |
if ((d == OP_PROP) == |
switch(code[4]) |
| 1066 |
(rqdtype == ((rqdtype >= 128)? (category + 128) : chartype))) |
{ |
| 1067 |
|
case PT_ANY: |
| 1068 |
|
OK = TRUE; |
| 1069 |
|
break; |
| 1070 |
|
|
| 1071 |
|
case PT_LAMP: |
| 1072 |
|
OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt; |
| 1073 |
|
break; |
| 1074 |
|
|
| 1075 |
|
case PT_GC: |
| 1076 |
|
OK = category == code[5]; |
| 1077 |
|
break; |
| 1078 |
|
|
| 1079 |
|
case PT_PC: |
| 1080 |
|
OK = chartype == code[5]; |
| 1081 |
|
break; |
| 1082 |
|
|
| 1083 |
|
case PT_SC: |
| 1084 |
|
OK = script == code[5]; |
| 1085 |
|
break; |
| 1086 |
|
|
| 1087 |
|
/* Should never occur, but keep compilers from grumbling. */ |
| 1088 |
|
|
| 1089 |
|
default: |
| 1090 |
|
OK = codevalue != OP_PROP; |
| 1091 |
|
break; |
| 1092 |
|
} |
| 1093 |
|
|
| 1094 |
|
if (OK == (d == OP_PROP)) |
| 1095 |
{ |
{ |
| 1096 |
if (++count >= GET2(code, 1)) |
if (++count >= GET2(code, 1)) |
| 1097 |
{ ADD_NEW(state_offset + 5, 0); } |
{ ADD_NEW(state_offset + 6, 0); } |
| 1098 |
else |
else |
| 1099 |
{ ADD_NEW(state_offset, count); } |
{ ADD_NEW(state_offset, count); } |
| 1100 |
} |
} |
| 1108 |
if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT) |
if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT) |
| 1109 |
{ ADD_ACTIVE(state_offset + 4, 0); } |
{ ADD_ACTIVE(state_offset + 4, 0); } |
| 1110 |
count = current_state->count; /* Number already matched */ |
count = current_state->count; /* Number already matched */ |
| 1111 |
if (clen > 0 && ucp_findchar(c, &chartype, &othercase) != ucp_M) |
if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M) |
| 1112 |
{ |
{ |
| 1113 |
const uschar *nptr = ptr + clen; |
const uschar *nptr = ptr + clen; |
| 1114 |
int ncount = 0; |
int ncount = 0; |
| 1117 |
int nd; |
int nd; |
| 1118 |
int ndlen = 1; |
int ndlen = 1; |
| 1119 |
GETCHARLEN(nd, nptr, ndlen); |
GETCHARLEN(nd, nptr, ndlen); |
| 1120 |
if (ucp_findchar(nd, &chartype, &othercase) != ucp_M) break; |
if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break; |
| 1121 |
ncount++; |
ncount++; |
| 1122 |
nptr += ndlen; |
nptr += ndlen; |
| 1123 |
} |
} |
| 1148 |
{ |
{ |
| 1149 |
if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else |
if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else |
| 1150 |
{ |
{ |
| 1151 |
|
int othercase; |
| 1152 |
if (c < 128) othercase = fcc[c]; else |
if (c < 128) othercase = fcc[c]; else |
| 1153 |
|
|
| 1154 |
/* If we have Unicode property support, we can use it to test the |
/* If we have Unicode property support, we can use it to test the |
| 1155 |
other case of the character, if there is one. The result of |
other case of the character. */ |
|
ucp_findchar() is < 0 if the char isn't found, and othercase is |
|
|
returned as zero if there isn't another case. */ |
|
| 1156 |
|
|
| 1157 |
#ifdef SUPPORT_UCP |
#ifdef SUPPORT_UCP |
| 1158 |
if (ucp_findchar(c, &chartype, &othercase) < 0) |
othercase = _pcre_ucp_othercase(c); |
| 1159 |
|
#else |
| 1160 |
|
othercase = -1; |
| 1161 |
#endif |
#endif |
|
othercase = -1; |
|
| 1162 |
|
|
| 1163 |
if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); } |
if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); } |
| 1164 |
} |
} |
| 1180 |
to wait for them to pass before continuing. */ |
to wait for them to pass before continuing. */ |
| 1181 |
|
|
| 1182 |
case OP_EXTUNI: |
case OP_EXTUNI: |
| 1183 |
if (clen > 0 && ucp_findchar(c, &chartype, &othercase) != ucp_M) |
if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M) |
| 1184 |
{ |
{ |
| 1185 |
const uschar *nptr = ptr + clen; |
const uschar *nptr = ptr + clen; |
| 1186 |
int ncount = 0; |
int ncount = 0; |
| 1188 |
{ |
{ |
| 1189 |
int nclen = 1; |
int nclen = 1; |
| 1190 |
GETCHARLEN(c, nptr, nclen); |
GETCHARLEN(c, nptr, nclen); |
| 1191 |
if (ucp_findchar(c, &chartype, &othercase) != ucp_M) break; |
if (_pcre_ucp_findprop(c, &chartype, &script) != ucp_M) break; |
| 1192 |
ncount++; |
ncount++; |
| 1193 |
nptr += nclen; |
nptr += nclen; |
| 1194 |
} |
} |
| 1223 |
if ((ims & PCRE_CASELESS) != 0) |
if ((ims & PCRE_CASELESS) != 0) |
| 1224 |
{ |
{ |
| 1225 |
#ifdef SUPPORT_UTF8 |
#ifdef SUPPORT_UTF8 |
| 1226 |
if (utf8 && c >= 128) |
if (utf8 && d >= 128) |
| 1227 |
{ |
{ |
| 1228 |
#ifdef SUPPORT_UCP |
#ifdef SUPPORT_UCP |
| 1229 |
if (ucp_findchar(d, &chartype, &otherd) < 0) otherd = -1; |
otherd = _pcre_ucp_othercase(d); |
| 1230 |
#endif /* SUPPORT_UCP */ |
#endif /* SUPPORT_UCP */ |
| 1231 |
} |
} |
| 1232 |
else |
else |
| 1247 |
if (clen > 0) |
if (clen > 0) |
| 1248 |
{ |
{ |
| 1249 |
int otherd = -1; |
int otherd = -1; |
| 1250 |
if ((ims && PCRE_CASELESS) != 0) |
if ((ims & PCRE_CASELESS) != 0) |
| 1251 |
{ |
{ |
| 1252 |
#ifdef SUPPORT_UTF8 |
#ifdef SUPPORT_UTF8 |
| 1253 |
if (utf8 && c >= 128) |
if (utf8 && d >= 128) |
| 1254 |
{ |
{ |
| 1255 |
#ifdef SUPPORT_UCP |
#ifdef SUPPORT_UCP |
| 1256 |
if (ucp_findchar(c, &chartype, &otherd) < 0) otherd = -1; |
otherd = _pcre_ucp_othercase(d); |
| 1257 |
#endif /* SUPPORT_UCP */ |
#endif /* SUPPORT_UCP */ |
| 1258 |
} |
} |
| 1259 |
else |
else |
| 1274 |
if (clen > 0) |
if (clen > 0) |
| 1275 |
{ |
{ |
| 1276 |
int otherd = -1; |
int otherd = -1; |
| 1277 |
if ((ims && PCRE_CASELESS) != 0) |
if ((ims & PCRE_CASELESS) != 0) |
| 1278 |
{ |
{ |
| 1279 |
#ifdef SUPPORT_UTF8 |
#ifdef SUPPORT_UTF8 |
| 1280 |
if (utf8 && c >= 128) |
if (utf8 && d >= 128) |
| 1281 |
{ |
{ |
| 1282 |
#ifdef SUPPORT_UCP |
#ifdef SUPPORT_UCP |
| 1283 |
if (ucp_findchar(c, &chartype, &otherd) < 0) otherd = -1; |
otherd = _pcre_ucp_othercase(d); |
| 1284 |
#endif /* SUPPORT_UCP */ |
#endif /* SUPPORT_UCP */ |
| 1285 |
} |
} |
| 1286 |
else |
else |
| 1308 |
if ((ims & PCRE_CASELESS) != 0) |
if ((ims & PCRE_CASELESS) != 0) |
| 1309 |
{ |
{ |
| 1310 |
#ifdef SUPPORT_UTF8 |
#ifdef SUPPORT_UTF8 |
| 1311 |
if (utf8 && c >= 128) |
if (utf8 && d >= 128) |
| 1312 |
{ |
{ |
| 1313 |
#ifdef SUPPORT_UCP |
#ifdef SUPPORT_UCP |
| 1314 |
if (ucp_findchar(d, &chartype, &otherd) < 0) otherd = -1; |
otherd = _pcre_ucp_othercase(d); |
| 1315 |
#endif /* SUPPORT_UCP */ |
#endif /* SUPPORT_UCP */ |
| 1316 |
} |
} |
| 1317 |
else |
else |
| 1397 |
{ ADD_ACTIVE(next_state_offset + 5, 0); } |
{ ADD_ACTIVE(next_state_offset + 5, 0); } |
| 1398 |
if (isinclass) |
if (isinclass) |
| 1399 |
{ |
{ |
| 1400 |
if (++count >= GET2(ecode, 3)) |
int max = GET2(ecode, 3); |
| 1401 |
|
if (++count >= max && max != 0) /* Max 0 => no limit */ |
| 1402 |
{ ADD_NEW(next_state_offset + 5, 0); } |
{ ADD_NEW(next_state_offset + 5, 0); } |
| 1403 |
else |
else |
| 1404 |
{ ADD_NEW(state_offset, count); } |
{ ADD_NEW(state_offset, count); } |
| 1555 |
/*-----------------------------------------------------------------*/ |
/*-----------------------------------------------------------------*/ |
| 1556 |
case OP_ONCE: |
case OP_ONCE: |
| 1557 |
{ |
{ |
|
const uschar *endcode; |
|
| 1558 |
int local_offsets[2]; |
int local_offsets[2]; |
| 1559 |
int local_workspace[1000]; |
int local_workspace[1000]; |
| 1560 |
|
|
| 1576 |
const uschar *end_subpattern = code; |
const uschar *end_subpattern = code; |
| 1577 |
int charcount = local_offsets[1] - local_offsets[0]; |
int charcount = local_offsets[1] - local_offsets[0]; |
| 1578 |
int next_state_offset, repeat_state_offset; |
int next_state_offset, repeat_state_offset; |
|
BOOL is_repeated; |
|
| 1579 |
|
|
| 1580 |
do { end_subpattern += GET(end_subpattern, 1); } |
do { end_subpattern += GET(end_subpattern, 1); } |
| 1581 |
while (*end_subpattern == OP_ALT); |
while (*end_subpattern == OP_ALT); |
| 1650 |
cb.version = 1; /* Version 1 of the callout block */ |
cb.version = 1; /* Version 1 of the callout block */ |
| 1651 |
cb.callout_number = code[1]; |
cb.callout_number = code[1]; |
| 1652 |
cb.offset_vector = offsets; |
cb.offset_vector = offsets; |
| 1653 |
cb.subject = (char *)start_subject; |
cb.subject = (PCRE_SPTR)start_subject; |
| 1654 |
cb.subject_length = end_subject - start_subject; |
cb.subject_length = end_subject - start_subject; |
| 1655 |
cb.start_match = current_subject - start_subject; |
cb.start_match = current_subject - start_subject; |
| 1656 |
cb.current_position = ptr - start_subject; |
cb.current_position = ptr - start_subject; |
| 1698 |
DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n" |
DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n" |
| 1699 |
"%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count, |
"%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count, |
| 1700 |
rlevel*2-2, SP)); |
rlevel*2-2, SP)); |
| 1701 |
return match_count; |
break; /* In effect, "return", but see the comment below */ |
| 1702 |
} |
} |
| 1703 |
|
|
| 1704 |
/* One or more states are active for the next character. */ |
/* One or more states are active for the next character. */ |
| 1706 |
ptr += clen; /* Advance to next subject character */ |
ptr += clen; /* Advance to next subject character */ |
| 1707 |
} /* Loop to move along the subject string */ |
} /* Loop to move along the subject string */ |
| 1708 |
|
|
| 1709 |
/* Control never gets here, but we must keep the compiler happy. */ |
/* Control gets here from "break" a few lines above. We do it this way because |
| 1710 |
|
if we use "return" above, we have compiler trouble. Some compilers warn if |
| 1711 |
|
there's nothing here because they think the function doesn't return a value. On |
| 1712 |
|
the other hand, if we put a dummy statement here, some more clever compilers |
| 1713 |
|
complain that it can't be reached. Sigh. */ |
| 1714 |
|
|
| 1715 |
DPRINTF(("%.*s+++ Unexpected end of internal_dfa_exec %d +++\n" |
return match_count; |
|
"%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, rlevel*2-2, SP)); |
|
|
return PCRE_ERROR_NOMATCH; |
|
| 1716 |
} |
} |
| 1717 |
|
|
| 1718 |
|
|
| 1744 |
< -1 => some kind of unexpected problem |
< -1 => some kind of unexpected problem |
| 1745 |
*/ |
*/ |
| 1746 |
|
|
| 1747 |
EXPORT int |
PCRE_DATA_SCOPE int |
| 1748 |
pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data, |
pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data, |
| 1749 |
const char *subject, int length, int start_offset, int options, int *offsets, |
const char *subject, int length, int start_offset, int options, int *offsets, |
| 1750 |
int offsetcount, int *workspace, int wscount) |
int offsetcount, int *workspace, int wscount) |
| 1751 |
{ |
{ |
| 1752 |
real_pcre *re = (real_pcre *)argument_re; |
real_pcre *re = (real_pcre *)argument_re; |
| 1753 |
dfa_match_data match_block; |
dfa_match_data match_block; |
| 1754 |
|
dfa_match_data *md = &match_block; |
| 1755 |
BOOL utf8, anchored, startline, firstline; |
BOOL utf8, anchored, startline, firstline; |
| 1756 |
const uschar *current_subject, *end_subject, *lcc; |
const uschar *current_subject, *end_subject, *lcc; |
| 1757 |
|
|
| 1766 |
int first_byte = -1; |
int first_byte = -1; |
| 1767 |
int req_byte = -1; |
int req_byte = -1; |
| 1768 |
int req_byte2 = -1; |
int req_byte2 = -1; |
| 1769 |
|
int newline; |
| 1770 |
|
|
| 1771 |
/* Plausibility checks */ |
/* Plausibility checks */ |
| 1772 |
|
|
| 1781 |
match block, so we must initialize them beforehand. However, the other fields |
match block, so we must initialize them beforehand. However, the other fields |
| 1782 |
in the match block must not be set until after the byte flipping. */ |
in the match block must not be set until after the byte flipping. */ |
| 1783 |
|
|
| 1784 |
match_block.tables = re->tables; |
md->tables = re->tables; |
| 1785 |
match_block.callout_data = NULL; |
md->callout_data = NULL; |
| 1786 |
|
|
| 1787 |
if (extra_data != NULL) |
if (extra_data != NULL) |
| 1788 |
{ |
{ |
| 1790 |
if ((flags & PCRE_EXTRA_STUDY_DATA) != 0) |
if ((flags & PCRE_EXTRA_STUDY_DATA) != 0) |
| 1791 |
study = (const pcre_study_data *)extra_data->study_data; |
study = (const pcre_study_data *)extra_data->study_data; |
| 1792 |
if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT; |
if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT; |
| 1793 |
|
if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0) |
| 1794 |
|
return PCRE_ERROR_DFA_UMLIMIT; |
| 1795 |
if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0) |
if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0) |
| 1796 |
match_block.callout_data = extra_data->callout_data; |
md->callout_data = extra_data->callout_data; |
| 1797 |
if ((flags & PCRE_EXTRA_TABLES) != 0) |
if ((flags & PCRE_EXTRA_TABLES) != 0) |
| 1798 |
match_block.tables = extra_data->tables; |
md->tables = extra_data->tables; |
| 1799 |
} |
} |
| 1800 |
|
|
| 1801 |
/* Check that the first field in the block is the magic number. If it is not, |
/* Check that the first field in the block is the magic number. If it is not, |
| 1816 |
end_subject = (const unsigned char *)subject + length; |
end_subject = (const unsigned char *)subject + length; |
| 1817 |
req_byte_ptr = current_subject - 1; |
req_byte_ptr = current_subject - 1; |
| 1818 |
|
|
| 1819 |
|
#ifdef SUPPORT_UTF8 |
| 1820 |
utf8 = (re->options & PCRE_UTF8) != 0; |
utf8 = (re->options & PCRE_UTF8) != 0; |
| 1821 |
anchored = (options & PCRE_ANCHORED) != 0 || (re->options & PCRE_ANCHORED) != 0; |
#else |
| 1822 |
|
utf8 = FALSE; |
| 1823 |
|
#endif |
| 1824 |
|
|
| 1825 |
|
anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 || |
| 1826 |
|
(re->options & PCRE_ANCHORED) != 0; |
| 1827 |
|
|
| 1828 |
/* The remaining fixed data for passing around. */ |
/* The remaining fixed data for passing around. */ |
| 1829 |
|
|
| 1830 |
match_block.start_code = (const uschar *)argument_re + |
md->start_code = (const uschar *)argument_re + |
| 1831 |
re->name_table_offset + re->name_count * re->name_entry_size; |
re->name_table_offset + re->name_count * re->name_entry_size; |
| 1832 |
match_block.start_subject = (const unsigned char *)subject; |
md->start_subject = (const unsigned char *)subject; |
| 1833 |
match_block.end_subject = end_subject; |
md->end_subject = end_subject; |
| 1834 |
match_block.moptions = options; |
md->moptions = options; |
| 1835 |
match_block.poptions = re->options; |
md->poptions = re->options; |
| 1836 |
|
|
| 1837 |
|
/* Handle different types of newline. The two bits give four cases. If nothing |
| 1838 |
|
is set at run time, whatever was used at compile time applies. */ |
| 1839 |
|
|
| 1840 |
|
switch ((((options & PCRE_NEWLINE_CRLF) == 0)? re->options : options) & |
| 1841 |
|
PCRE_NEWLINE_CRLF) |
| 1842 |
|
{ |
| 1843 |
|
default: newline = NEWLINE; break; /* Compile-time default */ |
| 1844 |
|
case PCRE_NEWLINE_CR: newline = '\r'; break; |
| 1845 |
|
case PCRE_NEWLINE_LF: newline = '\n'; break; |
| 1846 |
|
case PCRE_NEWLINE_CR+ |
| 1847 |
|
PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break; |
| 1848 |
|
} |
| 1849 |
|
|
| 1850 |
|
if (newline > 255) |
| 1851 |
|
{ |
| 1852 |
|
md->nllen = 2; |
| 1853 |
|
md->nl[0] = (newline >> 8) & 255; |
| 1854 |
|
md->nl[1] = newline & 255; |
| 1855 |
|
} |
| 1856 |
|
else |
| 1857 |
|
{ |
| 1858 |
|
md->nllen = 1; |
| 1859 |
|
md->nl[0] = newline; |
| 1860 |
|
} |
| 1861 |
|
|
| 1862 |
/* Check a UTF-8 string if required. Unfortunately there's no way of passing |
/* Check a UTF-8 string if required. Unfortunately there's no way of passing |
| 1863 |
back the character offset. */ |
back the character offset. */ |
| 1883 |
is a feature that makes it possible to save compiled regex and re-use them |
is a feature that makes it possible to save compiled regex and re-use them |
| 1884 |
in other programs later. */ |
in other programs later. */ |
| 1885 |
|
|
| 1886 |
if (match_block.tables == NULL) match_block.tables = _pcre_default_tables; |
if (md->tables == NULL) md->tables = _pcre_default_tables; |
| 1887 |
|
|
| 1888 |
/* The lower casing table and the "must be at the start of a line" flag are |
/* The lower casing table and the "must be at the start of a line" flag are |
| 1889 |
used in a loop when finding where to start. */ |
used in a loop when finding where to start. */ |
| 1890 |
|
|
| 1891 |
lcc = match_block.tables + lcc_offset; |
lcc = md->tables + lcc_offset; |
| 1892 |
startline = (re->options & PCRE_STARTLINE) != 0; |
startline = (re->options & PCRE_STARTLINE) != 0; |
| 1893 |
firstline = (re->options & PCRE_FIRSTLINE) != 0; |
firstline = (re->options & PCRE_FIRSTLINE) != 0; |
| 1894 |
|
|
| 1921 |
{ |
{ |
| 1922 |
req_byte = re->req_byte & 255; |
req_byte = re->req_byte & 255; |
| 1923 |
req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0; |
req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0; |
| 1924 |
req_byte2 = (match_block.tables + fcc_offset)[req_byte]; /* case flipped */ |
req_byte2 = (md->tables + fcc_offset)[req_byte]; /* case flipped */ |
| 1925 |
} |
} |
| 1926 |
|
|
| 1927 |
/* Call the main matching function, looping for a non-anchored regex after a |
/* Call the main matching function, looping for a non-anchored regex after a |
| 1939 |
|
|
| 1940 |
/* Advance to a unique first char if possible. If firstline is TRUE, the |
/* Advance to a unique first char if possible. If firstline is TRUE, the |
| 1941 |
start of the match is constrained to the first line of a multiline string. |
start of the match is constrained to the first line of a multiline string. |
| 1942 |
Implement this by temporarily adjusting end_subject so that we stop scanning |
Implement this by temporarily adjusting end_subject so that we stop |
| 1943 |
at a newline. If the match fails at the newline, later code breaks this loop. |
scanning at a newline. If the match fails at the newline, later code breaks |
| 1944 |
*/ |
this loop. */ |
| 1945 |
|
|
| 1946 |
if (firstline) |
if (firstline) |
| 1947 |
{ |
{ |
| 1948 |
const uschar *t = current_subject; |
const uschar *t = current_subject; |
| 1949 |
while (t < save_end_subject && *t != '\n') t++; |
while (t <= save_end_subject - md->nllen && !IS_NEWLINE(t)) t++; |
| 1950 |
end_subject = t; |
end_subject = t; |
| 1951 |
} |
} |
| 1952 |
|
|
| 1961 |
current_subject++; |
current_subject++; |
| 1962 |
} |
} |
| 1963 |
|
|
| 1964 |
/* Or to just after \n for a multiline match if possible */ |
/* Or to just after a linebreak for a multiline match if possible */ |
| 1965 |
|
|
| 1966 |
else if (startline) |
else if (startline) |
| 1967 |
{ |
{ |
| 1968 |
if (current_subject > match_block.start_subject + start_offset) |
if (current_subject > md->start_subject + md->nllen + |
| 1969 |
|
start_offset) |
| 1970 |
{ |
{ |
| 1971 |
while (current_subject < end_subject && current_subject[-1] != NEWLINE) |
while (current_subject <= end_subject && |
| 1972 |
|
!IS_NEWLINE(current_subject - md->nllen)) |
| 1973 |
current_subject++; |
current_subject++; |
| 1974 |
} |
} |
| 1975 |
} |
} |
| 2050 |
/* OK, now we can do the business */ |
/* OK, now we can do the business */ |
| 2051 |
|
|
| 2052 |
rc = internal_dfa_exec( |
rc = internal_dfa_exec( |
| 2053 |
&match_block, /* fixed match data */ |
md, /* fixed match data */ |
| 2054 |
match_block.start_code, /* this subexpression's code */ |
md->start_code, /* this subexpression's code */ |
| 2055 |
current_subject, /* where we currently are */ |
current_subject, /* where we currently are */ |
| 2056 |
start_offset, /* start offset in subject */ |
start_offset, /* start offset in subject */ |
| 2057 |
offsets, /* offset vector */ |
offsets, /* offset vector */ |
| 2058 |
offsetcount, /* size of same */ |
offsetcount, /* size of same */ |
| 2059 |
workspace, /* workspace vector */ |
workspace, /* workspace vector */ |
| 2060 |
wscount, /* size of same */ |
wscount, /* size of same */ |
| 2061 |
re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */ |
re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */ |
| 2062 |
0, /* function recurse level */ |
0, /* function recurse level */ |
| 2063 |
0); /* regex recurse level */ |
0); /* regex recurse level */ |
| 2064 |
|
|
| 2065 |
/* Anything other than "no match" means we are done, always; otherwise, carry |
/* Anything other than "no match" means we are done, always; otherwise, carry |
| 2066 |
on only if not anchored. */ |
on only if not anchored. */ |
| 2070 |
/* Advance to the next subject character unless we are at the end of a line |
/* Advance to the next subject character unless we are at the end of a line |
| 2071 |
and firstline is set. */ |
and firstline is set. */ |
| 2072 |
|
|
| 2073 |
if (firstline && *current_subject == NEWLINE) break; |
if (firstline && |
| 2074 |
|
current_subject <= end_subject - md->nllen && |
| 2075 |
|
IS_NEWLINE(current_subject)) break; |
| 2076 |
current_subject++; |
current_subject++; |
|
|
|
|
#ifdef SUPPORT_UTF8 |
|
| 2077 |
if (utf8) |
if (utf8) |
| 2078 |
{ |
{ |
| 2079 |
while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80) |
while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80) |
| 2080 |
current_subject++; |
current_subject++; |
| 2081 |
} |
} |
|
#endif |
|
|
|
|
| 2082 |
if (current_subject > end_subject) break; |
if (current_subject > end_subject) break; |
| 2083 |
} |
} |
| 2084 |
|
|