| 42 |
supporting internal functions that are not used by other modules. */ |
supporting internal functions that are not used by other modules. */ |
| 43 |
|
|
| 44 |
|
|
| 45 |
|
#define NLBLOCK cd /* The block containing newline information */ |
| 46 |
#include "pcre_internal.h" |
#include "pcre_internal.h" |
| 47 |
|
|
| 48 |
|
|
| 191 |
"unrecognized character after (?<", |
"unrecognized character after (?<", |
| 192 |
/* 25 */ |
/* 25 */ |
| 193 |
"lookbehind assertion is not fixed length", |
"lookbehind assertion is not fixed length", |
| 194 |
"malformed number after (?(", |
"malformed number or name after (?(", |
| 195 |
"conditional group contains more than two branches", |
"conditional group contains more than two branches", |
| 196 |
"assertion expected after (?(", |
"assertion expected after (?(", |
| 197 |
"(?R or (?digits must be followed by )", |
"(?R or (?digits must be followed by )", |
| 211 |
"recursive call could loop indefinitely", |
"recursive call could loop indefinitely", |
| 212 |
"unrecognized character after (?P", |
"unrecognized character after (?P", |
| 213 |
"syntax error after (?P", |
"syntax error after (?P", |
| 214 |
"two named groups have the same name", |
"two named subpatterns have the same name", |
| 215 |
"invalid UTF-8 string", |
"invalid UTF-8 string", |
| 216 |
/* 45 */ |
/* 45 */ |
| 217 |
"support for \\P, \\p, and \\X has not been compiled", |
"support for \\P, \\p, and \\X has not been compiled", |
| 218 |
"malformed \\P or \\p sequence", |
"malformed \\P or \\p sequence", |
| 219 |
"unknown property name after \\P or \\p" |
"unknown property name after \\P or \\p", |
| 220 |
|
"subpattern name is too long (maximum 32 characters)", |
| 221 |
|
"too many named subpatterns (maximum 10,000)", |
| 222 |
|
/* 50 */ |
| 223 |
|
"repeated subpattern is too long", |
| 224 |
|
"octal value is greater than \\377 (not in UTF-8 mode)" |
| 225 |
}; |
}; |
| 226 |
|
|
| 227 |
|
|
| 466 |
} |
} |
| 467 |
|
|
| 468 |
/* \0 always starts an octal number, but we may drop through to here with a |
/* \0 always starts an octal number, but we may drop through to here with a |
| 469 |
larger first octal digit. */ |
larger first octal digit. The original code used just to take the least |
| 470 |
|
significant 8 bits of octal numbers (I think this is what early Perls used |
| 471 |
|
to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more |
| 472 |
|
than 3 octal digits. */ |
| 473 |
|
|
| 474 |
case '0': |
case '0': |
| 475 |
c -= '0'; |
c -= '0'; |
| 476 |
while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7') |
while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7') |
| 477 |
c = c * 8 + *(++ptr) - '0'; |
c = c * 8 + *(++ptr) - '0'; |
| 478 |
c &= 255; /* Take least significant 8 bits */ |
if (!utf8 && c > 255) *errorcodeptr = ERR51; |
| 479 |
break; |
break; |
| 480 |
|
|
| 481 |
/* \x is complicated. \x{ddd} is a character number which can be greater |
/* \x is complicated. \x{ddd} is a character number which can be greater |
| 772 |
|
|
| 773 |
|
|
| 774 |
/************************************************* |
/************************************************* |
| 775 |
|
* Find forward referenced named subpattern * |
| 776 |
|
*************************************************/ |
| 777 |
|
|
| 778 |
|
/* This function scans along a pattern looking for capturing subpatterns, and |
| 779 |
|
counting them. If it finds a named pattern that matches the name it is given, |
| 780 |
|
it returns its number. This is used for forward references to named |
| 781 |
|
subpatterns. We know that if (?P< is encountered, the name will be terminated |
| 782 |
|
by '>' because that is checked in the first pass. |
| 783 |
|
|
| 784 |
|
Arguments: |
| 785 |
|
pointer current position in the pattern |
| 786 |
|
count current count of capturing parens |
| 787 |
|
name name to seek |
| 788 |
|
namelen name length |
| 789 |
|
|
| 790 |
|
Returns: the number of the named subpattern, or -1 if not found |
| 791 |
|
*/ |
| 792 |
|
|
| 793 |
|
static int |
| 794 |
|
find_named_parens(const uschar *ptr, int count, const uschar *name, int namelen) |
| 795 |
|
{ |
| 796 |
|
const uschar *thisname; |
| 797 |
|
for (; *ptr != 0; ptr++) |
| 798 |
|
{ |
| 799 |
|
if (*ptr == '\\' && ptr[1] != 0) { ptr++; continue; } |
| 800 |
|
if (*ptr != '(') continue; |
| 801 |
|
if (ptr[1] != '?') { count++; continue; } |
| 802 |
|
if (ptr[2] == '(') { ptr += 2; continue; } |
| 803 |
|
if (ptr[2] != 'P' || ptr[3] != '<') continue; |
| 804 |
|
count++; |
| 805 |
|
ptr += 4; |
| 806 |
|
thisname = ptr; |
| 807 |
|
while (*ptr != '>') ptr++; |
| 808 |
|
if (namelen == ptr - thisname && strncmp(name, thisname, namelen) == 0) |
| 809 |
|
return count; |
| 810 |
|
} |
| 811 |
|
return -1; |
| 812 |
|
} |
| 813 |
|
|
| 814 |
|
|
| 815 |
|
|
| 816 |
|
/************************************************* |
| 817 |
* Find first significant op code * |
* Find first significant op code * |
| 818 |
*************************************************/ |
*************************************************/ |
| 819 |
|
|
| 968 |
|
|
| 969 |
case OP_CHAR: |
case OP_CHAR: |
| 970 |
case OP_CHARNC: |
case OP_CHARNC: |
| 971 |
|
case OP_NOT: |
| 972 |
branchlength++; |
branchlength++; |
| 973 |
cc += 2; |
cc += 2; |
| 974 |
#ifdef SUPPORT_UTF8 |
#ifdef SUPPORT_UTF8 |
| 1083 |
static const uschar * |
static const uschar * |
| 1084 |
find_bracket(const uschar *code, BOOL utf8, int number) |
find_bracket(const uschar *code, BOOL utf8, int number) |
| 1085 |
{ |
{ |
|
#ifndef SUPPORT_UTF8 |
|
|
utf8 = utf8; /* Stop pedantic compilers complaining */ |
|
|
#endif |
|
|
|
|
| 1086 |
for (;;) |
for (;;) |
| 1087 |
{ |
{ |
| 1088 |
register int c = *code; |
register int c = *code; |
| 1089 |
if (c == OP_END) return NULL; |
if (c == OP_END) return NULL; |
| 1090 |
|
|
| 1091 |
|
/* XCLASS is used for classes that cannot be represented just by a bit |
| 1092 |
|
map. This includes negated single high-valued characters. The length in |
| 1093 |
|
the table is zero; the actual length is stored in the compiled code. */ |
| 1094 |
|
|
| 1095 |
|
if (c == OP_XCLASS) code += GET(code, 1); |
| 1096 |
|
|
| 1097 |
|
/* Handle bracketed group */ |
| 1098 |
|
|
| 1099 |
else if (c > OP_BRA) |
else if (c > OP_BRA) |
| 1100 |
{ |
{ |
| 1101 |
int n = c - OP_BRA; |
int n = c - OP_BRA; |
| 1103 |
if (n == number) return (uschar *)code; |
if (n == number) return (uschar *)code; |
| 1104 |
code += _pcre_OP_lengths[OP_BRA]; |
code += _pcre_OP_lengths[OP_BRA]; |
| 1105 |
} |
} |
| 1106 |
|
|
| 1107 |
|
/* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes |
| 1108 |
|
that are followed by a character may be followed by a multi-byte character. |
| 1109 |
|
The length in the table is a minimum, so we have to scan along to skip the |
| 1110 |
|
extra bytes. All opcodes are less than 128, so we can use relatively |
| 1111 |
|
efficient code. */ |
| 1112 |
|
|
| 1113 |
else |
else |
| 1114 |
{ |
{ |
| 1115 |
code += _pcre_OP_lengths[c]; |
code += _pcre_OP_lengths[c]; |
|
|
|
|
#ifdef SUPPORT_UTF8 |
|
|
|
|
|
/* In UTF-8 mode, opcodes that are followed by a character may be followed |
|
|
by a multi-byte character. The length in the table is a minimum, so we have |
|
|
to scan along to skip the extra bytes. All opcodes are less than 128, so we |
|
|
can use relatively efficient code. */ |
|
|
|
|
| 1116 |
if (utf8) switch(c) |
if (utf8) switch(c) |
| 1117 |
{ |
{ |
| 1118 |
case OP_CHAR: |
case OP_CHAR: |
| 1128 |
case OP_MINQUERY: |
case OP_MINQUERY: |
| 1129 |
while ((*code & 0xc0) == 0x80) code++; |
while ((*code & 0xc0) == 0x80) code++; |
| 1130 |
break; |
break; |
|
|
|
|
/* XCLASS is used for classes that cannot be represented just by a bit |
|
|
map. This includes negated single high-valued characters. The length in |
|
|
the table is zero; the actual length is stored in the compiled code. */ |
|
|
|
|
|
case OP_XCLASS: |
|
|
code += GET(code, 1) + 1; |
|
|
break; |
|
| 1131 |
} |
} |
|
#endif |
|
| 1132 |
} |
} |
| 1133 |
} |
} |
| 1134 |
} |
} |
| 1152 |
static const uschar * |
static const uschar * |
| 1153 |
find_recurse(const uschar *code, BOOL utf8) |
find_recurse(const uschar *code, BOOL utf8) |
| 1154 |
{ |
{ |
|
#ifndef SUPPORT_UTF8 |
|
|
utf8 = utf8; /* Stop pedantic compilers complaining */ |
|
|
#endif |
|
|
|
|
| 1155 |
for (;;) |
for (;;) |
| 1156 |
{ |
{ |
| 1157 |
register int c = *code; |
register int c = *code; |
| 1158 |
if (c == OP_END) return NULL; |
if (c == OP_END) return NULL; |
| 1159 |
else if (c == OP_RECURSE) return code; |
if (c == OP_RECURSE) return code; |
| 1160 |
|
|
| 1161 |
|
/* XCLASS is used for classes that cannot be represented just by a bit |
| 1162 |
|
map. This includes negated single high-valued characters. The length in |
| 1163 |
|
the table is zero; the actual length is stored in the compiled code. */ |
| 1164 |
|
|
| 1165 |
|
if (c == OP_XCLASS) code += GET(code, 1); |
| 1166 |
|
|
| 1167 |
|
/* All bracketed groups have the same length. */ |
| 1168 |
|
|
| 1169 |
else if (c > OP_BRA) |
else if (c > OP_BRA) |
| 1170 |
{ |
{ |
| 1171 |
code += _pcre_OP_lengths[OP_BRA]; |
code += _pcre_OP_lengths[OP_BRA]; |
| 1172 |
} |
} |
| 1173 |
|
|
| 1174 |
|
/* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes |
| 1175 |
|
that are followed by a character may be followed by a multi-byte character. |
| 1176 |
|
The length in the table is a minimum, so we have to scan along to skip the |
| 1177 |
|
extra bytes. All opcodes are less than 128, so we can use relatively |
| 1178 |
|
efficient code. */ |
| 1179 |
|
|
| 1180 |
else |
else |
| 1181 |
{ |
{ |
| 1182 |
code += _pcre_OP_lengths[c]; |
code += _pcre_OP_lengths[c]; |
|
|
|
|
#ifdef SUPPORT_UTF8 |
|
|
|
|
|
/* In UTF-8 mode, opcodes that are followed by a character may be followed |
|
|
by a multi-byte character. The length in the table is a minimum, so we have |
|
|
to scan along to skip the extra bytes. All opcodes are less than 128, so we |
|
|
can use relatively efficient code. */ |
|
|
|
|
| 1183 |
if (utf8) switch(c) |
if (utf8) switch(c) |
| 1184 |
{ |
{ |
| 1185 |
case OP_CHAR: |
case OP_CHAR: |
| 1195 |
case OP_MINQUERY: |
case OP_MINQUERY: |
| 1196 |
while ((*code & 0xc0) == 0x80) code++; |
while ((*code & 0xc0) == 0x80) code++; |
| 1197 |
break; |
break; |
|
|
|
|
/* XCLASS is used for classes that cannot be represented just by a bit |
|
|
map. This includes negated single high-valued characters. The length in |
|
|
the table is zero; the actual length is stored in the compiled code. */ |
|
|
|
|
|
case OP_XCLASS: |
|
|
code += GET(code, 1) + 1; |
|
|
break; |
|
| 1198 |
} |
} |
|
#endif |
|
| 1199 |
} |
} |
| 1200 |
} |
} |
| 1201 |
} |
} |
| 1611 |
int firstbyte, reqbyte; |
int firstbyte, reqbyte; |
| 1612 |
int zeroreqbyte, zerofirstbyte; |
int zeroreqbyte, zerofirstbyte; |
| 1613 |
int req_caseopt, reqvary, tempreqvary; |
int req_caseopt, reqvary, tempreqvary; |
|
int condcount = 0; |
|
| 1614 |
int options = *optionsptr; |
int options = *optionsptr; |
| 1615 |
int after_manual_callout = 0; |
int after_manual_callout = 0; |
| 1616 |
register int c; |
register int c; |
| 1724 |
if ((cd->ctypes[c] & ctype_space) != 0) continue; |
if ((cd->ctypes[c] & ctype_space) != 0) continue; |
| 1725 |
if (c == '#') |
if (c == '#') |
| 1726 |
{ |
{ |
| 1727 |
/* The space before the ; is to avoid a warning on a silly compiler |
while (*(++ptr) != 0) if (IS_NEWLINE(ptr)) break; |
| 1728 |
on the Macintosh. */ |
if (*ptr != 0) |
| 1729 |
while ((c = *(++ptr)) != 0 && c != NEWLINE) ; |
{ |
| 1730 |
if (c != 0) continue; /* Else fall through to handle end of string */ |
ptr += cd->nllen - 1; |
| 1731 |
|
continue; |
| 1732 |
|
} |
| 1733 |
|
/* Else fall through to handle end of string */ |
| 1734 |
|
c = 0; |
| 1735 |
} |
} |
| 1736 |
} |
} |
| 1737 |
|
|
| 2896 |
case '(': |
case '(': |
| 2897 |
bravalue = OP_COND; /* Conditional group */ |
bravalue = OP_COND; /* Conditional group */ |
| 2898 |
|
|
| 2899 |
/* Condition to test for recursion */ |
/* A condition can be a number, referring to a numbered group, a name, |
| 2900 |
|
referring to a named group, 'R', referring to recursion, or an |
| 2901 |
|
assertion. There are two unfortunate ambiguities, caused by history. |
| 2902 |
|
(a) 'R' can be the recursive thing or the name 'R', and (b) a number |
| 2903 |
|
could be a name that consists of digits. In both cases, we look for a |
| 2904 |
|
name first; if not found, we try the other cases. If the first |
| 2905 |
|
character after (?( is a word character, we know the rest up to ) will |
| 2906 |
|
also be word characters because the syntax was checked in the first |
| 2907 |
|
pass. */ |
| 2908 |
|
|
| 2909 |
if (ptr[1] == 'R') |
if ((cd->ctypes[ptr[1]] & ctype_word) != 0) |
| 2910 |
{ |
{ |
| 2911 |
code[1+LINK_SIZE] = OP_CREF; |
int i, namelen; |
| 2912 |
PUT2(code, 2+LINK_SIZE, CREF_RECURSE); |
int condref = 0; |
| 2913 |
|
const uschar *name; |
| 2914 |
|
uschar *slot = cd->name_table; |
| 2915 |
|
|
| 2916 |
|
/* This is needed for all successful cases. */ |
| 2917 |
|
|
| 2918 |
skipbytes = 3; |
skipbytes = 3; |
|
ptr += 3; |
|
|
} |
|
| 2919 |
|
|
| 2920 |
/* Condition to test for a numbered subpattern match. We know that |
/* Read the name, but also get it as a number if it's all digits */ |
|
if a digit follows ( then there will just be digits until ) because |
|
|
the syntax was checked in the first pass. */ |
|
| 2921 |
|
|
| 2922 |
else if ((digitab[ptr[1]] && ctype_digit) != 0) |
name = ++ptr; |
| 2923 |
{ |
while (*ptr != ')') |
|
int condref; /* Don't amalgamate; some compilers */ |
|
|
condref = *(++ptr) - '0'; /* grumble at autoincrement in declaration */ |
|
|
while (*(++ptr) != ')') condref = condref*10 + *ptr - '0'; |
|
|
if (condref == 0) |
|
| 2924 |
{ |
{ |
| 2925 |
*errorcodeptr = ERR35; |
if (condref >= 0) |
| 2926 |
goto FAILED; |
condref = ((digitab[*ptr] & ctype_digit) != 0)? |
| 2927 |
|
condref * 10 + *ptr - '0' : -1; |
| 2928 |
|
ptr++; |
| 2929 |
} |
} |
| 2930 |
|
namelen = ptr - name; |
| 2931 |
ptr++; |
ptr++; |
| 2932 |
code[1+LINK_SIZE] = OP_CREF; |
|
| 2933 |
PUT2(code, 2+LINK_SIZE, condref); |
for (i = 0; i < cd->names_found; i++) |
| 2934 |
skipbytes = 3; |
{ |
| 2935 |
|
if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break; |
| 2936 |
|
slot += cd->name_entry_size; |
| 2937 |
|
} |
| 2938 |
|
|
| 2939 |
|
/* Found a previous named subpattern */ |
| 2940 |
|
|
| 2941 |
|
if (i < cd->names_found) |
| 2942 |
|
{ |
| 2943 |
|
condref = GET2(slot, 0); |
| 2944 |
|
code[1+LINK_SIZE] = OP_CREF; |
| 2945 |
|
PUT2(code, 2+LINK_SIZE, condref); |
| 2946 |
|
} |
| 2947 |
|
|
| 2948 |
|
/* Search the pattern for a forward reference */ |
| 2949 |
|
|
| 2950 |
|
else if ((i = find_named_parens(ptr, *brackets, name, namelen)) > 0) |
| 2951 |
|
{ |
| 2952 |
|
code[1+LINK_SIZE] = OP_CREF; |
| 2953 |
|
PUT2(code, 2+LINK_SIZE, i); |
| 2954 |
|
} |
| 2955 |
|
|
| 2956 |
|
/* Check for 'R' for recursion */ |
| 2957 |
|
|
| 2958 |
|
else if (namelen == 1 && *name == 'R') |
| 2959 |
|
{ |
| 2960 |
|
code[1+LINK_SIZE] = OP_CREF; |
| 2961 |
|
PUT2(code, 2+LINK_SIZE, CREF_RECURSE); |
| 2962 |
|
} |
| 2963 |
|
|
| 2964 |
|
/* Check for a subpattern number */ |
| 2965 |
|
|
| 2966 |
|
else if (condref > 0) |
| 2967 |
|
{ |
| 2968 |
|
code[1+LINK_SIZE] = OP_CREF; |
| 2969 |
|
PUT2(code, 2+LINK_SIZE, condref); |
| 2970 |
|
} |
| 2971 |
|
|
| 2972 |
|
/* Either an unidentified subpattern, or a reference to (?(0) */ |
| 2973 |
|
|
| 2974 |
|
else |
| 2975 |
|
{ |
| 2976 |
|
*errorcodeptr = (condref == 0)? ERR35: ERR15; |
| 2977 |
|
goto FAILED; |
| 2978 |
|
} |
| 2979 |
} |
} |
| 2980 |
|
|
| 2981 |
/* For conditions that are assertions, we just fall through, having |
/* For conditions that are assertions, we just fall through, having |
| 2982 |
set bravalue above. */ |
set bravalue above. */ |
| 2983 |
|
|
| 2984 |
break; |
break; |
| 2985 |
|
|
| 2986 |
case '=': /* Positive lookahead */ |
case '=': /* Positive lookahead */ |
| 3052 |
{ |
{ |
| 3053 |
if (slot[2+namelen] == 0) |
if (slot[2+namelen] == 0) |
| 3054 |
{ |
{ |
| 3055 |
*errorcodeptr = ERR43; |
if ((options & PCRE_DUPNAMES) == 0) |
| 3056 |
goto FAILED; |
{ |
| 3057 |
|
*errorcodeptr = ERR43; |
| 3058 |
|
goto FAILED; |
| 3059 |
|
} |
| 3060 |
} |
} |
| 3061 |
crc = -1; /* Current name is substring */ |
else crc = -1; /* Current name is substring */ |
| 3062 |
} |
} |
| 3063 |
if (crc < 0) |
if (crc < 0) |
| 3064 |
{ |
{ |
| 3091 |
if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break; |
if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break; |
| 3092 |
slot += cd->name_entry_size; |
slot += cd->name_entry_size; |
| 3093 |
} |
} |
| 3094 |
if (i >= cd->names_found) |
|
| 3095 |
|
if (i < cd->names_found) /* Back reference */ |
| 3096 |
|
{ |
| 3097 |
|
recno = GET2(slot, 0); |
| 3098 |
|
} |
| 3099 |
|
else if ((recno = /* Forward back reference */ |
| 3100 |
|
find_named_parens(ptr, *brackets, name, namelen)) <= 0) |
| 3101 |
{ |
{ |
| 3102 |
*errorcodeptr = ERR15; |
*errorcodeptr = ERR15; |
| 3103 |
goto FAILED; |
goto FAILED; |
| 3104 |
} |
} |
| 3105 |
|
|
|
recno = GET2(slot, 0); |
|
|
|
|
| 3106 |
if (type == '>') goto HANDLE_RECURSION; /* A few lines below */ |
if (type == '>') goto HANDLE_RECURSION; /* A few lines below */ |
| 3107 |
|
|
| 3108 |
/* Back reference */ |
/* Back reference */ |
| 3142 |
regex in case it doesn't exist. */ |
regex in case it doesn't exist. */ |
| 3143 |
|
|
| 3144 |
*code = OP_END; |
*code = OP_END; |
| 3145 |
called = (recno == 0)? |
called = (recno == 0)? cd->start_code : |
| 3146 |
cd->start_code : find_bracket(cd->start_code, utf8, recno); |
find_bracket(cd->start_code, utf8, recno); |
|
|
|
| 3147 |
if (called == NULL) |
if (called == NULL) |
| 3148 |
{ |
{ |
| 3149 |
*errorcodeptr = ERR15; |
*errorcodeptr = ERR15; |
| 3190 |
case '-': optset = &unset; break; |
case '-': optset = &unset; break; |
| 3191 |
|
|
| 3192 |
case 'i': *optset |= PCRE_CASELESS; break; |
case 'i': *optset |= PCRE_CASELESS; break; |
| 3193 |
|
case 'J': *optset |= PCRE_DUPNAMES; break; |
| 3194 |
case 'm': *optset |= PCRE_MULTILINE; break; |
case 'm': *optset |= PCRE_MULTILINE; break; |
| 3195 |
case 's': *optset |= PCRE_DOTALL; break; |
case 's': *optset |= PCRE_DOTALL; break; |
| 3196 |
case 'x': *optset |= PCRE_EXTENDED; break; |
case 'x': *optset |= PCRE_EXTENDED; break; |
| 3307 |
else if (bravalue == OP_COND) |
else if (bravalue == OP_COND) |
| 3308 |
{ |
{ |
| 3309 |
uschar *tc = code; |
uschar *tc = code; |
| 3310 |
condcount = 0; |
int condcount = 0; |
| 3311 |
|
|
| 3312 |
do { |
do { |
| 3313 |
condcount++; |
condcount++; |
| 4012 |
} |
} |
| 4013 |
|
|
| 4014 |
|
|
| 4015 |
|
|
| 4016 |
PCRE_DATA_SCOPE pcre * |
PCRE_DATA_SCOPE pcre * |
| 4017 |
pcre_compile2(const char *pattern, int options, int *errorcodeptr, |
pcre_compile2(const char *pattern, int options, int *errorcodeptr, |
| 4018 |
const char **errorptr, int *erroroffset, const unsigned char *tables) |
const char **errorptr, int *erroroffset, const unsigned char *tables) |
| 4019 |
{ |
{ |
| 4020 |
real_pcre *re; |
real_pcre *re; |
| 4021 |
int length = 1 + LINK_SIZE; /* For initial BRA plus length */ |
int length = 1 + LINK_SIZE; /* For initial BRA plus length */ |
| 4022 |
int c, firstbyte, reqbyte; |
int c, firstbyte, reqbyte, newline; |
| 4023 |
int bracount = 0; |
int bracount = 0; |
| 4024 |
int branch_extra = 0; |
int branch_extra = 0; |
| 4025 |
int branch_newextra; |
int branch_newextra; |
| 4040 |
const uschar *codestart; |
const uschar *codestart; |
| 4041 |
const uschar *ptr; |
const uschar *ptr; |
| 4042 |
compile_data compile_block; |
compile_data compile_block; |
| 4043 |
|
compile_data *cd = &compile_block; |
| 4044 |
int brastack[BRASTACK_SIZE]; |
int brastack[BRASTACK_SIZE]; |
| 4045 |
uschar bralenstack[BRASTACK_SIZE]; |
uschar bralenstack[BRASTACK_SIZE]; |
| 4046 |
|
|
| 4094 |
/* Set up pointers to the individual character tables */ |
/* Set up pointers to the individual character tables */ |
| 4095 |
|
|
| 4096 |
if (tables == NULL) tables = _pcre_default_tables; |
if (tables == NULL) tables = _pcre_default_tables; |
| 4097 |
compile_block.lcc = tables + lcc_offset; |
cd->lcc = tables + lcc_offset; |
| 4098 |
compile_block.fcc = tables + fcc_offset; |
cd->fcc = tables + fcc_offset; |
| 4099 |
compile_block.cbits = tables + cbits_offset; |
cd->cbits = tables + cbits_offset; |
| 4100 |
compile_block.ctypes = tables + ctypes_offset; |
cd->ctypes = tables + ctypes_offset; |
| 4101 |
|
|
| 4102 |
|
/* Handle different types of newline. The two bits give four cases. The current |
| 4103 |
|
code allows for one- or two-byte sequences. */ |
| 4104 |
|
|
| 4105 |
|
switch (options & PCRE_NEWLINE_CRLF) |
| 4106 |
|
{ |
| 4107 |
|
default: newline = NEWLINE; break; /* Compile-time default */ |
| 4108 |
|
case PCRE_NEWLINE_CR: newline = '\r'; break; |
| 4109 |
|
case PCRE_NEWLINE_LF: newline = '\n'; break; |
| 4110 |
|
case PCRE_NEWLINE_CR+ |
| 4111 |
|
PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break; |
| 4112 |
|
} |
| 4113 |
|
|
| 4114 |
|
if (newline > 255) |
| 4115 |
|
{ |
| 4116 |
|
cd->nllen = 2; |
| 4117 |
|
cd->nl[0] = (newline >> 8) & 255; |
| 4118 |
|
cd->nl[1] = newline & 255; |
| 4119 |
|
} |
| 4120 |
|
else |
| 4121 |
|
{ |
| 4122 |
|
cd->nllen = 1; |
| 4123 |
|
cd->nl[0] = newline; |
| 4124 |
|
} |
| 4125 |
|
|
| 4126 |
/* Maximum back reference and backref bitmap. This is updated for numeric |
/* Maximum back reference and backref bitmap. This is updated for numeric |
| 4127 |
references during the first pass, but for named references during the actual |
references during the first pass, but for named references during the actual |
| 4128 |
compile pass. The bitmap records up to 31 back references to help in deciding |
compile pass. The bitmap records up to 31 back references to help in deciding |
| 4129 |
whether (.*) can be treated as anchored or not. */ |
whether (.*) can be treated as anchored or not. */ |
| 4130 |
|
|
| 4131 |
compile_block.top_backref = 0; |
cd->top_backref = 0; |
| 4132 |
compile_block.backref_map = 0; |
cd->backref_map = 0; |
| 4133 |
|
|
| 4134 |
/* Reflect pattern for debugging output */ |
/* Reflect pattern for debugging output */ |
| 4135 |
|
|
| 4163 |
|
|
| 4164 |
if ((options & PCRE_EXTENDED) != 0) |
if ((options & PCRE_EXTENDED) != 0) |
| 4165 |
{ |
{ |
| 4166 |
if ((compile_block.ctypes[c] & ctype_space) != 0) continue; |
if ((cd->ctypes[c] & ctype_space) != 0) continue; |
| 4167 |
if (c == '#') |
if (c == '#') |
| 4168 |
{ |
{ |
| 4169 |
/* The space before the ; is to avoid a warning on a silly compiler |
while (*(++ptr) != 0) if (IS_NEWLINE(ptr)) break; |
| 4170 |
on the Macintosh. */ |
if (*ptr != 0) |
| 4171 |
while ((c = *(++ptr)) != 0 && c != NEWLINE) ; |
{ |
| 4172 |
if (c == 0) break; |
ptr += cd->nllen - 1; |
| 4173 |
continue; |
continue; |
| 4174 |
|
} |
| 4175 |
|
break; /* End loop at end of pattern */ |
| 4176 |
} |
} |
| 4177 |
} |
} |
| 4178 |
|
|
| 4262 |
if (c <= -ESC_REF) |
if (c <= -ESC_REF) |
| 4263 |
{ |
{ |
| 4264 |
int refnum = -c - ESC_REF; |
int refnum = -c - ESC_REF; |
| 4265 |
compile_block.backref_map |= (refnum < 32)? (1 << refnum) : 1; |
cd->backref_map |= (refnum < 32)? (1 << refnum) : 1; |
| 4266 |
if (refnum > compile_block.top_backref) |
if (refnum > cd->top_backref) |
| 4267 |
compile_block.top_backref = refnum; |
cd->top_backref = refnum; |
| 4268 |
length += 2; /* For single back reference */ |
length += 2; /* For single back reference */ |
| 4269 |
if (ptr[1] == '{' && is_counted_repeat(ptr+2)) |
if (ptr[1] == '{' && is_counted_repeat(ptr+2)) |
| 4270 |
{ |
{ |
| 4418 |
/* Check the syntax for POSIX stuff. The bits we actually handle are |
/* Check the syntax for POSIX stuff. The bits we actually handle are |
| 4419 |
checked during the real compile phase. */ |
checked during the real compile phase. */ |
| 4420 |
|
|
| 4421 |
else if (*ptr == '[' && check_posix_syntax(ptr, &ptr, &compile_block)) |
else if (*ptr == '[' && |
| 4422 |
|
(ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') && |
| 4423 |
|
check_posix_syntax(ptr, &ptr, cd)) |
| 4424 |
{ |
{ |
| 4425 |
ptr++; |
ptr++; |
| 4426 |
class_optcount = 10; /* Make sure > 1 */ |
class_optcount = 10; /* Make sure > 1 */ |
| 4653 |
ptr += 2; |
ptr += 2; |
| 4654 |
break; |
break; |
| 4655 |
|
|
| 4656 |
|
/* Named subpatterns are an extension copied from Python */ |
| 4657 |
|
|
| 4658 |
|
case 'P': |
| 4659 |
|
ptr += 3; |
| 4660 |
|
|
| 4661 |
|
/* Handle the definition of a named subpattern */ |
| 4662 |
|
|
| 4663 |
|
if (*ptr == '<') |
| 4664 |
|
{ |
| 4665 |
|
const uschar *p; /* Don't amalgamate; some compilers */ |
| 4666 |
|
p = ++ptr; /* grumble at autoincrement in declaration */ |
| 4667 |
|
while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++; |
| 4668 |
|
if (*ptr != '>') |
| 4669 |
|
{ |
| 4670 |
|
errorcode = ERR42; |
| 4671 |
|
goto PCRE_ERROR_RETURN; |
| 4672 |
|
} |
| 4673 |
|
name_count++; |
| 4674 |
|
if (name_count > MAX_NAME_COUNT) |
| 4675 |
|
{ |
| 4676 |
|
errorcode = ERR49; |
| 4677 |
|
goto PCRE_ERROR_RETURN; |
| 4678 |
|
} |
| 4679 |
|
if (ptr - p > max_name_size) |
| 4680 |
|
{ |
| 4681 |
|
max_name_size = (ptr - p); |
| 4682 |
|
if (max_name_size > MAX_NAME_SIZE) |
| 4683 |
|
{ |
| 4684 |
|
errorcode = ERR48; |
| 4685 |
|
goto PCRE_ERROR_RETURN; |
| 4686 |
|
} |
| 4687 |
|
} |
| 4688 |
|
capturing = TRUE; /* Named parentheses are always capturing */ |
| 4689 |
|
break; /* Go handle capturing parentheses */ |
| 4690 |
|
} |
| 4691 |
|
|
| 4692 |
|
/* Handle back references and recursive calls to named subpatterns */ |
| 4693 |
|
|
| 4694 |
|
if (*ptr == '=' || *ptr == '>') |
| 4695 |
|
{ |
| 4696 |
|
length += 3 + 3*LINK_SIZE; /* Allow for the automatic "once" */ |
| 4697 |
|
while ((cd->ctypes[*(++ptr)] & ctype_word) != 0); |
| 4698 |
|
if (*ptr != ')') |
| 4699 |
|
{ |
| 4700 |
|
errorcode = ERR42; |
| 4701 |
|
goto PCRE_ERROR_RETURN; |
| 4702 |
|
} |
| 4703 |
|
goto RECURSE_CHECK_QUANTIFIED; |
| 4704 |
|
} |
| 4705 |
|
|
| 4706 |
|
/* Unknown character after (?P */ |
| 4707 |
|
|
| 4708 |
|
errorcode = ERR41; |
| 4709 |
|
goto PCRE_ERROR_RETURN; |
| 4710 |
|
|
| 4711 |
/* (?R) specifies a recursive call to the regex, which is an extension |
/* (?R) specifies a recursive call to the regex, which is an extension |
| 4712 |
to provide the facility which can be obtained by (?p{perl-code}) in |
to provide the facility which can be obtained by (?p{perl-code}) in |
| 4713 |
Perl 5.6. In Perl 5.8 this has become (??{perl-code}). |
Perl 5.6. In Perl 5.8 this has become (??{perl-code}). |
| 4733 |
|
|
| 4734 |
/* If this item is quantified, it will get wrapped inside brackets so |
/* If this item is quantified, it will get wrapped inside brackets so |
| 4735 |
as to use the code for quantified brackets. We jump down and use the |
as to use the code for quantified brackets. We jump down and use the |
| 4736 |
code that handles this for real brackets. */ |
code that handles this for real brackets. Come here from code for |
| 4737 |
|
named recursions/subroutines. */ |
| 4738 |
|
|
| 4739 |
|
RECURSE_CHECK_QUANTIFIED: |
| 4740 |
if (ptr[1] == '+' || ptr[1] == '*' || ptr[1] == '?' || ptr[1] == '{') |
if (ptr[1] == '+' || ptr[1] == '*' || ptr[1] == '?' || ptr[1] == '{') |
| 4741 |
{ |
{ |
| 4742 |
length += 2 + 2 * LINK_SIZE; /* to make bracketed */ |
length += 2 + 2 * LINK_SIZE; /* to make bracketed */ |
| 4760 |
length += 2 + 2*LINK_SIZE; |
length += 2 + 2*LINK_SIZE; |
| 4761 |
continue; |
continue; |
| 4762 |
|
|
|
/* Named subpatterns are an extension copied from Python */ |
|
|
|
|
|
case 'P': |
|
|
ptr += 3; |
|
|
|
|
|
/* Handle the definition of a named subpattern */ |
|
|
|
|
|
if (*ptr == '<') |
|
|
{ |
|
|
const uschar *p; /* Don't amalgamate; some compilers */ |
|
|
p = ++ptr; /* grumble at autoincrement in declaration */ |
|
|
while ((compile_block.ctypes[*ptr] & ctype_word) != 0) ptr++; |
|
|
if (*ptr != '>') |
|
|
{ |
|
|
errorcode = ERR42; |
|
|
goto PCRE_ERROR_RETURN; |
|
|
} |
|
|
name_count++; |
|
|
if (ptr - p > max_name_size) max_name_size = (ptr - p); |
|
|
capturing = TRUE; /* Named parentheses are always capturing */ |
|
|
break; |
|
|
} |
|
|
|
|
|
/* Handle back references and recursive calls to named subpatterns */ |
|
|
|
|
|
if (*ptr == '=' || *ptr == '>') |
|
|
{ |
|
|
length += 2 + 2*LINK_SIZE; /* Allow for the automatic "once" */ |
|
|
while ((compile_block.ctypes[*(++ptr)] & ctype_word) != 0); |
|
|
if (*ptr != ')') |
|
|
{ |
|
|
errorcode = ERR42; |
|
|
goto PCRE_ERROR_RETURN; |
|
|
} |
|
|
break; |
|
|
} |
|
|
|
|
|
/* Unknown character after (?P */ |
|
|
|
|
|
errorcode = ERR41; |
|
|
goto PCRE_ERROR_RETURN; |
|
|
|
|
| 4763 |
/* Lookbehinds are in Perl from version 5.005 */ |
/* Lookbehinds are in Perl from version 5.005 */ |
| 4764 |
|
|
| 4765 |
case '<': |
case '<': |
| 4775 |
|
|
| 4776 |
/* Conditionals are in Perl from version 5.005. The bracket must either |
/* Conditionals are in Perl from version 5.005. The bracket must either |
| 4777 |
be followed by a number (for bracket reference) or by an assertion |
be followed by a number (for bracket reference) or by an assertion |
| 4778 |
group, or (a PCRE extension) by 'R' for a recursion test. */ |
group. PCRE extends this by allowing a name to reference a named group; |
| 4779 |
|
unfortunately, previously 'R' was implemented for a recursion test. |
| 4780 |
|
When this is compiled, we look for the named group 'R' first. At this |
| 4781 |
|
point we just do a basic syntax check. */ |
| 4782 |
|
|
| 4783 |
case '(': |
case '(': |
| 4784 |
if (ptr[3] == 'R' && ptr[4] == ')') |
if ((cd->ctypes[ptr[3]] & ctype_word) != 0) |
|
{ |
|
|
ptr += 4; |
|
|
length += 3; |
|
|
} |
|
|
else if ((digitab[ptr[3]] & ctype_digit) != 0) |
|
| 4785 |
{ |
{ |
| 4786 |
ptr += 4; |
ptr += 4; |
| 4787 |
length += 3; |
length += 3; |
| 4788 |
while ((digitab[*ptr] & ctype_digit) != 0) ptr++; |
while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++; |
| 4789 |
if (*ptr != ')') |
if (*ptr != ')') |
| 4790 |
{ |
{ |
| 4791 |
errorcode = ERR26; |
errorcode = ERR26; |
| 4824 |
*optset |= PCRE_CASELESS; |
*optset |= PCRE_CASELESS; |
| 4825 |
continue; |
continue; |
| 4826 |
|
|
| 4827 |
|
case 'J': |
| 4828 |
|
*optset |= PCRE_DUPNAMES; |
| 4829 |
|
options |= PCRE_JCHANGED; /* Record that it changed */ |
| 4830 |
|
continue; |
| 4831 |
|
|
| 4832 |
case 'm': |
case 'm': |
| 4833 |
*optset |= PCRE_MULTILINE; |
*optset |= PCRE_MULTILINE; |
| 4834 |
continue; |
continue; |
| 4894 |
will lead to an over-estimate on the length, but this shouldn't |
will lead to an over-estimate on the length, but this shouldn't |
| 4895 |
matter very much. We also have to allow for resetting options at |
matter very much. We also have to allow for resetting options at |
| 4896 |
the start of any alternations, which we do by setting |
the start of any alternations, which we do by setting |
| 4897 |
branch_newextra to 2. Finally, we record whether the case-dependent |
branch_newextra to 2. */ |
|
flag ever changes within the regex. This is used by the "required |
|
|
character" code. */ |
|
| 4898 |
|
|
| 4899 |
case ':': |
case ':': |
| 4900 |
if (((set|unset) & PCRE_IMS) != 0) |
if (((set|unset) & PCRE_IMS) != 0) |
| 4901 |
{ |
{ |
| 4902 |
length += 4; |
length += 4; |
| 4903 |
branch_newextra = 2; |
branch_newextra = 2; |
|
if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED; |
|
| 4904 |
} |
} |
| 4905 |
goto END_OPTIONS; |
goto END_OPTIONS; |
| 4906 |
|
|
| 4980 |
{ |
{ |
| 4981 |
duplength = length - brastack[--brastackptr]; |
duplength = length - brastack[--brastackptr]; |
| 4982 |
branch_extra = bralenstack[brastackptr]; |
branch_extra = bralenstack[brastackptr]; |
| 4983 |
|
/* This is a paranoid check to stop integer overflow later on */ |
| 4984 |
|
if (duplength > MAX_DUPLENGTH) |
| 4985 |
|
{ |
| 4986 |
|
errorcode = ERR50; |
| 4987 |
|
goto PCRE_ERROR_RETURN; |
| 4988 |
|
} |
| 4989 |
} |
} |
| 4990 |
else duplength = 0; |
else duplength = 0; |
| 4991 |
|
|
| 5090 |
} |
} |
| 5091 |
|
|
| 5092 |
/* Compute the size of data block needed and get it, either from malloc or |
/* Compute the size of data block needed and get it, either from malloc or |
| 5093 |
externally provided function. */ |
externally provided function. Integer overflow should no longer be possible |
| 5094 |
|
because nowadays we limit the maximum value of name_count and max_name size. */ |
| 5095 |
|
|
| 5096 |
size = length + sizeof(real_pcre) + name_count * (max_name_size + 3); |
size = length + sizeof(real_pcre) + name_count * (max_name_size + 3); |
| 5097 |
re = (real_pcre *)(pcre_malloc)(size); |
re = (real_pcre *)(pcre_malloc)(size); |
| 5121 |
/* The starting points of the name/number translation table and of the code are |
/* The starting points of the name/number translation table and of the code are |
| 5122 |
passed around in the compile data block. */ |
passed around in the compile data block. */ |
| 5123 |
|
|
| 5124 |
compile_block.names_found = 0; |
cd->names_found = 0; |
| 5125 |
compile_block.name_entry_size = max_name_size + 3; |
cd->name_entry_size = max_name_size + 3; |
| 5126 |
compile_block.name_table = (uschar *)re + re->name_table_offset; |
cd->name_table = (uschar *)re + re->name_table_offset; |
| 5127 |
codestart = compile_block.name_table + re->name_entry_size * re->name_count; |
codestart = cd->name_table + re->name_entry_size * re->name_count; |
| 5128 |
compile_block.start_code = codestart; |
cd->start_code = codestart; |
| 5129 |
compile_block.start_pattern = (const uschar *)pattern; |
cd->start_pattern = (const uschar *)pattern; |
| 5130 |
compile_block.req_varyopt = 0; |
cd->req_varyopt = 0; |
| 5131 |
compile_block.nopartial = FALSE; |
cd->nopartial = FALSE; |
| 5132 |
|
|
| 5133 |
/* Set up a starting, non-extracting bracket, then compile the expression. On |
/* Set up a starting, non-extracting bracket, then compile the expression. On |
| 5134 |
error, errorcode will be set non-zero, so we don't need to look at the result |
error, errorcode will be set non-zero, so we don't need to look at the result |
| 5139 |
*code = OP_BRA; |
*code = OP_BRA; |
| 5140 |
bracount = 0; |
bracount = 0; |
| 5141 |
(void)compile_regex(options, options & PCRE_IMS, &bracount, &code, &ptr, |
(void)compile_regex(options, options & PCRE_IMS, &bracount, &code, &ptr, |
| 5142 |
&errorcode, FALSE, 0, &firstbyte, &reqbyte, NULL, &compile_block); |
&errorcode, FALSE, 0, &firstbyte, &reqbyte, NULL, cd); |
| 5143 |
re->top_bracket = bracount; |
re->top_bracket = bracount; |
| 5144 |
re->top_backref = compile_block.top_backref; |
re->top_backref = cd->top_backref; |
| 5145 |
|
|
| 5146 |
if (compile_block.nopartial) re->options |= PCRE_NOPARTIAL; |
if (cd->nopartial) re->options |= PCRE_NOPARTIAL; |
| 5147 |
|
|
| 5148 |
/* If not reached end of pattern on success, there's an excess bracket. */ |
/* If not reached end of pattern on success, there's an excess bracket. */ |
| 5149 |
|
|
| 5189 |
if ((options & PCRE_ANCHORED) == 0) |
if ((options & PCRE_ANCHORED) == 0) |
| 5190 |
{ |
{ |
| 5191 |
int temp_options = options; |
int temp_options = options; |
| 5192 |
if (is_anchored(codestart, &temp_options, 0, compile_block.backref_map)) |
if (is_anchored(codestart, &temp_options, 0, cd->backref_map)) |
| 5193 |
re->options |= PCRE_ANCHORED; |
re->options |= PCRE_ANCHORED; |
| 5194 |
else |
else |
| 5195 |
{ |
{ |
| 5199 |
{ |
{ |
| 5200 |
int ch = firstbyte & 255; |
int ch = firstbyte & 255; |
| 5201 |
re->first_byte = ((firstbyte & REQ_CASELESS) != 0 && |
re->first_byte = ((firstbyte & REQ_CASELESS) != 0 && |
| 5202 |
compile_block.fcc[ch] == ch)? ch : firstbyte; |
cd->fcc[ch] == ch)? ch : firstbyte; |
| 5203 |
re->options |= PCRE_FIRSTSET; |
re->options |= PCRE_FIRSTSET; |
| 5204 |
} |
} |
| 5205 |
else if (is_startline(codestart, 0, compile_block.backref_map)) |
else if (is_startline(codestart, 0, cd->backref_map)) |
| 5206 |
re->options |= PCRE_STARTLINE; |
re->options |= PCRE_STARTLINE; |
| 5207 |
} |
} |
| 5208 |
} |
} |
| 5216 |
{ |
{ |
| 5217 |
int ch = reqbyte & 255; |
int ch = reqbyte & 255; |
| 5218 |
re->req_byte = ((reqbyte & REQ_CASELESS) != 0 && |
re->req_byte = ((reqbyte & REQ_CASELESS) != 0 && |
| 5219 |
compile_block.fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte; |
cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte; |
| 5220 |
re->options |= PCRE_REQCHSET; |
re->options |= PCRE_REQCHSET; |
| 5221 |
} |
} |
| 5222 |
|
|
| 5230 |
|
|
| 5231 |
if (re->options != 0) |
if (re->options != 0) |
| 5232 |
{ |
{ |
| 5233 |
printf("%s%s%s%s%s%s%s%s%s%s\n", |
printf("%s%s%s%s%s%s%s%s%s\n", |
| 5234 |
((re->options & PCRE_NOPARTIAL) != 0)? "nopartial " : "", |
((re->options & PCRE_NOPARTIAL) != 0)? "nopartial " : "", |
| 5235 |
((re->options & PCRE_ANCHORED) != 0)? "anchored " : "", |
((re->options & PCRE_ANCHORED) != 0)? "anchored " : "", |
| 5236 |
((re->options & PCRE_CASELESS) != 0)? "caseless " : "", |
((re->options & PCRE_CASELESS) != 0)? "caseless " : "", |
|
((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "", |
|
| 5237 |
((re->options & PCRE_EXTENDED) != 0)? "extended " : "", |
((re->options & PCRE_EXTENDED) != 0)? "extended " : "", |
| 5238 |
((re->options & PCRE_MULTILINE) != 0)? "multiline " : "", |
((re->options & PCRE_MULTILINE) != 0)? "multiline " : "", |
| 5239 |
((re->options & PCRE_DOTALL) != 0)? "dotall " : "", |
((re->options & PCRE_DOTALL) != 0)? "dotall " : "", |