| 140 |
#endif |
#endif |
| 141 |
|
|
| 142 |
|
|
| 143 |
|
/* Table of special "verbs" like (*PRUNE) */ |
| 144 |
|
|
| 145 |
|
/* One entry in the table of (*VERB) names.
     name  the spelling of the verb, compared against the pattern text
     len   the length of that spelling; a candidate is only compared when the
           length of the name found in the pattern equals this value
     op    the opcode stored in the compiled code when the verb matches */
typedef struct verbitem { |
| 146 |
|
const char *name; |
| 147 |
|
int len; |
| 148 |
|
int op; |
| 149 |
|
} verbitem; |
| 150 |
|
|
| 151 |
|
/* The recognized verb names. Each entry gives the name, its length in
characters, and the opcode to emit. "F" and "FAIL" are two spellings that
select the same opcode, OP_FAIL. The table is searched linearly, and a name
is accepted only when both its length and its characters match an entry. */
static verbitem verbs[] = { |
| 152 |
|
{ "ACCEPT", 6, OP_ACCEPT }, |
| 153 |
|
{ "COMMIT", 6, OP_COMMIT }, |
| 154 |
|
{ "F", 1, OP_FAIL }, |
| 155 |
|
{ "FAIL", 4, OP_FAIL }, |
| 156 |
|
{ "PRUNE", 5, OP_PRUNE }, |
| 157 |
|
{ "SKIP", 4, OP_SKIP }, |
| 158 |
|
{ "THEN", 4, OP_THEN } |
| 159 |
|
}; |
| 160 |
|
|
| 161 |
|
/* Number of entries in verbs[], computed from the array size so that it
follows the table automatically; used as the upper bound of the search loop. */
static int verbcount = sizeof(verbs)/sizeof(verbitem); |
| 162 |
|
|
| 163 |
|
|
| 164 |
/* Tables of names of POSIX character classes and their lengths. The list is |
/* Tables of names of POSIX character classes and their lengths. The list is |
| 165 |
terminated by a zero length entry. The first three must be alpha, lower, upper, |
terminated by a zero length entry. The first three must be alpha, lower, upper, |
| 166 |
as this is assumed for handling case independence. */ |
as this is assumed for handling case independence. */ |
| 279 |
"repeating a DEFINE group is not allowed", |
"repeating a DEFINE group is not allowed", |
| 280 |
"inconsistent NEWLINE options", |
"inconsistent NEWLINE options", |
| 281 |
"\\g is not followed by a braced name or an optionally braced non-zero number", |
"\\g is not followed by a braced name or an optionally braced non-zero number", |
| 282 |
"(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number" |
"(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number", |
| 283 |
|
"(*VERB) with an argument is not supported", |
| 284 |
|
/* 60 */ |
| 285 |
|
"(*VERB) not recognized", |
| 286 |
|
"number is too big" |
| 287 |
}; |
}; |
| 288 |
|
|
| 289 |
|
|
| 440 |
|
|
| 441 |
Returns: zero or positive => a data character |
Returns: zero or positive => a data character |
| 442 |
negative => a special escape sequence |
negative => a special escape sequence |
| 443 |
on error, errorptr is set |
on error, errorcodeptr is set |
| 444 |
*/ |
*/ |
| 445 |
|
|
| 446 |
static int |
static int |
| 524 |
c = 0; |
c = 0; |
| 525 |
while ((digitab[ptr[1]] & ctype_digit) != 0) |
while ((digitab[ptr[1]] & ctype_digit) != 0) |
| 526 |
c = c * 10 + *(++ptr) - '0'; |
c = c * 10 + *(++ptr) - '0'; |
| 527 |
|
|
| 528 |
|
if (c < 0) |
| 529 |
|
{ |
| 530 |
|
*errorcodeptr = ERR61; |
| 531 |
|
break; |
| 532 |
|
} |
| 533 |
|
|
| 534 |
if (c == 0 || (braced && *(++ptr) != '}')) |
if (c == 0 || (braced && *(++ptr) != '}')) |
| 535 |
{ |
{ |
| 536 |
*errorcodeptr = ERR57; |
*errorcodeptr = ERR57; |
| 537 |
return 0; |
break; |
| 538 |
} |
} |
| 539 |
|
|
| 540 |
if (negated) |
if (negated) |
| 542 |
if (c > bracount) |
if (c > bracount) |
| 543 |
{ |
{ |
| 544 |
*errorcodeptr = ERR15; |
*errorcodeptr = ERR15; |
| 545 |
return 0; |
break; |
| 546 |
} |
} |
| 547 |
c = bracount - (c - 1); |
c = bracount - (c - 1); |
| 548 |
} |
} |
| 571 |
c -= '0'; |
c -= '0'; |
| 572 |
while ((digitab[ptr[1]] & ctype_digit) != 0) |
while ((digitab[ptr[1]] & ctype_digit) != 0) |
| 573 |
c = c * 10 + *(++ptr) - '0'; |
c = c * 10 + *(++ptr) - '0'; |
| 574 |
|
if (c < 0) |
| 575 |
|
{ |
| 576 |
|
*errorcodeptr = ERR61; |
| 577 |
|
break; |
| 578 |
|
} |
| 579 |
if (c < 10 || c <= bracount) |
if (c < 10 || c <= bracount) |
| 580 |
{ |
{ |
| 581 |
c = -(ESC_REF + c); |
c = -(ESC_REF + c); |
| 671 |
if (c == 0) |
if (c == 0) |
| 672 |
{ |
{ |
| 673 |
*errorcodeptr = ERR2; |
*errorcodeptr = ERR2; |
| 674 |
return 0; |
break; |
| 675 |
} |
} |
| 676 |
|
|
| 677 |
#ifndef EBCDIC /* ASCII coding */ |
#ifndef EBCDIC /* ASCII coding */ |
| 950 |
{ |
{ |
| 951 |
while (*(++ptr) != ']') |
while (*(++ptr) != ']') |
| 952 |
{ |
{ |
| 953 |
|
if (*ptr == 0) return -1; |
| 954 |
if (*ptr == '\\') |
if (*ptr == '\\') |
| 955 |
{ |
{ |
| 956 |
if (*(++ptr) == 0) return -1; |
if (*(++ptr) == 0) return -1; |
| 978 |
/* An opening parens must now be a real metacharacter */ |
/* An opening parens must now be a real metacharacter */ |
| 979 |
|
|
| 980 |
if (*ptr != '(') continue; |
if (*ptr != '(') continue; |
| 981 |
if (ptr[1] != '?') |
if (ptr[1] != '?' && ptr[1] != '*') |
| 982 |
{ |
{ |
| 983 |
count++; |
count++; |
| 984 |
if (name == NULL && count == lorn) return count; |
if (name == NULL && count == lorn) return count; |
| 1106 |
{ |
{ |
| 1107 |
int d; |
int d; |
| 1108 |
register int op = *cc; |
register int op = *cc; |
|
|
|
| 1109 |
switch (op) |
switch (op) |
| 1110 |
{ |
{ |
| 1111 |
case OP_CBRA: |
case OP_CBRA: |
| 1194 |
|
|
| 1195 |
case OP_TYPEEXACT: |
case OP_TYPEEXACT: |
| 1196 |
branchlength += GET2(cc,1); |
branchlength += GET2(cc,1); |
| 1197 |
|
if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2; |
| 1198 |
cc += 4; |
cc += 4; |
| 1199 |
break; |
break; |
| 1200 |
|
|
| 1303 |
code += _pcre_OP_lengths[c]; |
code += _pcre_OP_lengths[c]; |
| 1304 |
} |
} |
| 1305 |
|
|
| 1306 |
/* In UTF-8 mode, opcodes that are followed by a character may be followed by |
/* Otherwise, we can get the item's length from the table, except that for |
| 1307 |
a multi-byte character. The length in the table is a minimum, so we have to |
repeated character types, we have to test for \p and \P, which have an extra |
| 1308 |
arrange to skip the extra bytes. */ |
two bytes of parameters. */ |
| 1309 |
|
|
| 1310 |
else |
else |
| 1311 |
{ |
{ |
| 1312 |
|
switch(c) |
| 1313 |
|
{ |
| 1314 |
|
case OP_TYPESTAR: |
| 1315 |
|
case OP_TYPEMINSTAR: |
| 1316 |
|
case OP_TYPEPLUS: |
| 1317 |
|
case OP_TYPEMINPLUS: |
| 1318 |
|
case OP_TYPEQUERY: |
| 1319 |
|
case OP_TYPEMINQUERY: |
| 1320 |
|
case OP_TYPEUPTO: |
| 1321 |
|
case OP_TYPEMINUPTO: |
| 1322 |
|
case OP_TYPEEXACT: |
| 1323 |
|
case OP_TYPEPOSSTAR: |
| 1324 |
|
case OP_TYPEPOSPLUS: |
| 1325 |
|
case OP_TYPEPOSQUERY: |
| 1326 |
|
case OP_TYPEPOSUPTO: |
| 1327 |
|
if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2; |
| 1328 |
|
break; |
| 1329 |
|
} |
| 1330 |
|
|
| 1331 |
|
/* Add in the fixed length from the table */ |
| 1332 |
|
|
| 1333 |
code += _pcre_OP_lengths[c]; |
code += _pcre_OP_lengths[c]; |
| 1334 |
|
|
| 1335 |
|
/* In UTF-8 mode, opcodes that are followed by a character may be followed by |
| 1336 |
|
a multi-byte character. The length in the table is a minimum, so we have to |
| 1337 |
|
arrange to skip the extra bytes. */ |
| 1338 |
|
|
| 1339 |
#ifdef SUPPORT_UTF8 |
#ifdef SUPPORT_UTF8 |
| 1340 |
if (utf8) switch(c) |
if (utf8) switch(c) |
| 1341 |
{ |
{ |
| 1386 |
register int c = *code; |
register int c = *code; |
| 1387 |
if (c == OP_END) return NULL; |
if (c == OP_END) return NULL; |
| 1388 |
if (c == OP_RECURSE) return code; |
if (c == OP_RECURSE) return code; |
| 1389 |
|
|
| 1390 |
/* XCLASS is used for classes that cannot be represented just by a bit |
/* XCLASS is used for classes that cannot be represented just by a bit |
| 1391 |
map. This includes negated single high-valued characters. The length in |
map. This includes negated single high-valued characters. The length in |
| 1392 |
the table is zero; the actual length is stored in the compiled code. */ |
the table is zero; the actual length is stored in the compiled code. */ |
| 1393 |
|
|
| 1394 |
if (c == OP_XCLASS) code += GET(code, 1); |
if (c == OP_XCLASS) code += GET(code, 1); |
| 1395 |
|
|
| 1396 |
/* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes |
/* Otherwise, we can get the item's length from the table, except that for |
| 1397 |
that are followed by a character may be followed by a multi-byte character. |
repeated character types, we have to test for \p and \P, which have an extra |
| 1398 |
The length in the table is a minimum, so we have to arrange to skip the extra |
two bytes of parameters. */ |
|
bytes. */ |
|
| 1399 |
|
|
| 1400 |
else |
else |
| 1401 |
{ |
{ |
| 1402 |
|
switch(c) |
| 1403 |
|
{ |
| 1404 |
|
case OP_TYPESTAR: |
| 1405 |
|
case OP_TYPEMINSTAR: |
| 1406 |
|
case OP_TYPEPLUS: |
| 1407 |
|
case OP_TYPEMINPLUS: |
| 1408 |
|
case OP_TYPEQUERY: |
| 1409 |
|
case OP_TYPEMINQUERY: |
| 1410 |
|
case OP_TYPEUPTO: |
| 1411 |
|
case OP_TYPEMINUPTO: |
| 1412 |
|
case OP_TYPEEXACT: |
| 1413 |
|
case OP_TYPEPOSSTAR: |
| 1414 |
|
case OP_TYPEPOSPLUS: |
| 1415 |
|
case OP_TYPEPOSQUERY: |
| 1416 |
|
case OP_TYPEPOSUPTO: |
| 1417 |
|
if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2; |
| 1418 |
|
break; |
| 1419 |
|
} |
| 1420 |
|
|
| 1421 |
|
/* Add in the fixed length from the table */ |
| 1422 |
|
|
| 1423 |
code += _pcre_OP_lengths[c]; |
code += _pcre_OP_lengths[c]; |
| 1424 |
|
|
| 1425 |
|
/* In UTF-8 mode, opcodes that are followed by a character may be followed |
| 1426 |
|
by a multi-byte character. The length in the table is a minimum, so we have |
| 1427 |
|
to arrange to skip the extra bytes. */ |
| 1428 |
|
|
| 1429 |
#ifdef SUPPORT_UTF8 |
#ifdef SUPPORT_UTF8 |
| 1430 |
if (utf8) switch(c) |
if (utf8) switch(c) |
| 1431 |
{ |
{ |
| 1521 |
|
|
| 1522 |
switch (c) |
switch (c) |
| 1523 |
{ |
{ |
| 1524 |
/* Check for quantifiers after a class */ |
/* Check for quantifiers after a class. XCLASS is used for classes that |
| 1525 |
|
cannot be represented just by a bit map. This includes negated single |
| 1526 |
|
high-valued characters. The length in _pcre_OP_lengths[] is zero; the |
| 1527 |
|
actual length is stored in the compiled code, so we must update "code" |
| 1528 |
|
here. */ |
| 1529 |
|
|
| 1530 |
#ifdef SUPPORT_UTF8 |
#ifdef SUPPORT_UTF8 |
| 1531 |
case OP_XCLASS: |
case OP_XCLASS: |
| 1532 |
ccode = code + GET(code, 1); |
ccode = code += GET(code, 1); |
| 1533 |
goto CHECK_CLASS_REPEAT; |
goto CHECK_CLASS_REPEAT; |
| 1534 |
#endif |
#endif |
| 1535 |
|
|
| 2546 |
} |
} |
| 2547 |
|
|
| 2548 |
/* If the first character is '^', set the negation flag and skip it. Also, |
/* If the first character is '^', set the negation flag and skip it. Also, |
| 2549 |
if the first few characters (either before or after ^) are \Q\E or \E we |
if the first few characters (either before or after ^) are \Q\E or \E we |
| 2550 |
skip them too. This makes for compatibility with Perl. */ |
skip them too. This makes for compatibility with Perl. */ |
| 2551 |
|
|
| 2552 |
negate_class = FALSE; |
negate_class = FALSE; |
| 2553 |
for (;;) |
for (;;) |
| 2554 |
{ |
{ |
| 2555 |
c = *(++ptr); |
c = *(++ptr); |
| 2556 |
if (c == '\\') |
if (c == '\\') |
| 2557 |
{ |
{ |
| 2558 |
if (ptr[1] == 'E') ptr++; |
if (ptr[1] == 'E') ptr++; |
| 2559 |
else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3; |
else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3; |
| 2560 |
else break; |
else break; |
| 2561 |
} |
} |
| 2562 |
else if (!negate_class && c == '^') |
else if (!negate_class && c == '^') |
| 2563 |
negate_class = TRUE; |
negate_class = TRUE; |
| 2564 |
else break; |
else break; |
| 2565 |
} |
} |
| 2566 |
|
|
| 2567 |
/* Keep a count of chars with values < 256 so that we can optimize the case |
/* Keep a count of chars with values < 256 so that we can optimize the case |
| 2568 |
of just a single character (as long as it's < 256). However, for higher |
of just a single character (as long as it's < 256). However, for higher |
| 2722 |
else inescq = TRUE; |
else inescq = TRUE; |
| 2723 |
continue; |
continue; |
| 2724 |
} |
} |
| 2725 |
|
else if (-c == ESC_E) continue; /* Ignore orphan \E */ |
| 2726 |
|
|
| 2727 |
if (c < 0) |
if (c < 0) |
| 2728 |
{ |
{ |
| 3168 |
*errorcodeptr = ERR6; |
*errorcodeptr = ERR6; |
| 3169 |
goto FAILED; |
goto FAILED; |
| 3170 |
} |
} |
| 3171 |
|
|
| 3172 |
/* If class_charcount is 1, we saw precisely one character whose value is |
/* If class_charcount is 1, we saw precisely one character whose value is |
| 3173 |
less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we |
less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we |
| 3174 |
can optimize the negative case only if there were no characters >= 128 |
can optimize the negative case only if there were no characters >= 128 |
| 3953 |
/* ===================================================================*/ |
/* ===================================================================*/ |
| 3954 |
/* Start of nested parenthesized sub-expression, or comment or lookahead or |
/* Start of nested parenthesized sub-expression, or comment or lookahead or |
| 3955 |
lookbehind or option setting or condition or all the other extended |
lookbehind or option setting or condition or all the other extended |
| 3956 |
parenthesis forms. First deal with the specials; all are introduced by ?, |
parenthesis forms. */ |
|
and the appearance of any of them means that this is not a capturing |
|
|
group. */ |
|
| 3957 |
|
|
| 3958 |
case '(': |
case '(': |
| 3959 |
newoptions = options; |
newoptions = options; |
| 3962 |
save_hwm = cd->hwm; |
save_hwm = cd->hwm; |
| 3963 |
reset_bracount = FALSE; |
reset_bracount = FALSE; |
| 3964 |
|
|
| 3965 |
if (*(++ptr) == '?') |
/* First deal with various "verbs" that can be introduced by '*'. */ |
| 3966 |
|
|
| 3967 |
|
if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0) |
| 3968 |
|
{ |
| 3969 |
|
int i, namelen; |
| 3970 |
|
const uschar *name = ++ptr; |
| 3971 |
|
previous = NULL; |
| 3972 |
|
while ((cd->ctypes[*++ptr] & ctype_letter) != 0); |
| 3973 |
|
if (*ptr == ':') |
| 3974 |
|
{ |
| 3975 |
|
*errorcodeptr = ERR59; /* Not supported */ |
| 3976 |
|
goto FAILED; |
| 3977 |
|
} |
| 3978 |
|
if (*ptr != ')') |
| 3979 |
|
{ |
| 3980 |
|
*errorcodeptr = ERR60; |
| 3981 |
|
goto FAILED; |
| 3982 |
|
} |
| 3983 |
|
namelen = ptr - name; |
| 3984 |
|
for (i = 0; i < verbcount; i++) |
| 3985 |
|
{ |
| 3986 |
|
if (namelen == verbs[i].len && |
| 3987 |
|
strncmp((char *)name, verbs[i].name, namelen) == 0) |
| 3988 |
|
{ |
| 3989 |
|
*code = verbs[i].op; |
| 3990 |
|
if (*code++ == OP_ACCEPT) cd->had_accept = TRUE; |
| 3991 |
|
break; |
| 3992 |
|
} |
| 3993 |
|
} |
| 3994 |
|
if (i < verbcount) continue; |
| 3995 |
|
*errorcodeptr = ERR60; |
| 3996 |
|
goto FAILED; |
| 3997 |
|
} |
| 3998 |
|
|
| 3999 |
|
/* Deal with the extended parentheses; all are introduced by '?', and the |
| 4000 |
|
appearance of any of them means that this is not a capturing group. */ |
| 4001 |
|
|
| 4002 |
|
else if (*ptr == '?') |
| 4003 |
{ |
{ |
| 4004 |
int i, set, unset, namelen; |
int i, set, unset, namelen; |
| 4005 |
int *optset; |
int *optset; |
| 4241 |
|
|
| 4242 |
/* ------------------------------------------------------------ */ |
/* ------------------------------------------------------------ */ |
| 4243 |
case '!': /* Negative lookahead */ |
case '!': /* Negative lookahead */ |
|
bravalue = OP_ASSERT_NOT; |
|
| 4244 |
ptr++; |
ptr++; |
| 4245 |
|
if (*ptr == ')') /* Optimize (?!) */ |
| 4246 |
|
{ |
| 4247 |
|
*code++ = OP_FAIL; |
| 4248 |
|
previous = NULL; |
| 4249 |
|
continue; |
| 4250 |
|
} |
| 4251 |
|
bravalue = OP_ASSERT_NOT; |
| 4252 |
break; |
break; |
| 4253 |
|
|
| 4254 |
|
|
| 4797 |
goto FAILED; |
goto FAILED; |
| 4798 |
} |
} |
| 4799 |
|
|
| 4800 |
/* In the pre-compile phase, update the length by the length of the nested |
/* In the pre-compile phase, update the length by the length of the group, |
| 4801 |
group, less the brackets at either end. Then reduce the compiled code to |
less the brackets at either end. Then reduce the compiled code to just a |
| 4802 |
just the brackets so that it doesn't use much memory if it is duplicated by |
set of non-capturing brackets so that it doesn't use much memory if it is |
| 4803 |
a quantifier. */ |
duplicated by a quantifier. */ |
| 4804 |
|
|
| 4805 |
if (lengthptr != NULL) |
if (lengthptr != NULL) |
| 4806 |
{ |
{ |
| 4810 |
goto FAILED; |
goto FAILED; |
| 4811 |
} |
} |
| 4812 |
*lengthptr += length_prevgroup - 2 - 2*LINK_SIZE; |
*lengthptr += length_prevgroup - 2 - 2*LINK_SIZE; |
| 4813 |
code++; |
*code++ = OP_BRA; |
| 4814 |
PUTINC(code, 0, 1 + LINK_SIZE); |
PUTINC(code, 0, 1 + LINK_SIZE); |
| 4815 |
*code++ = OP_KET; |
*code++ = OP_KET; |
| 4816 |
PUTINC(code, 0, 1 + LINK_SIZE); |
PUTINC(code, 0, 1 + LINK_SIZE); |
| 4817 |
|
break; /* No need to waste time with special character handling */ |
| 4818 |
} |
} |
| 4819 |
|
|
| 4820 |
/* Otherwise update the main code pointer to the end of the group. */ |
/* Otherwise update the main code pointer to the end of the group. */ |
| 4821 |
|
|
| 4822 |
else code = tempcode; |
code = tempcode; |
| 4823 |
|
|
| 4824 |
/* For a DEFINE group, required and first character settings are not |
/* For a DEFINE group, required and first character settings are not |
| 4825 |
relevant. */ |
relevant. */ |
| 5841 |
cd->hwm = cworkspace; |
cd->hwm = cworkspace; |
| 5842 |
cd->req_varyopt = 0; |
cd->req_varyopt = 0; |
| 5843 |
cd->nopartial = FALSE; |
cd->nopartial = FALSE; |
| 5844 |
|
cd->had_accept = FALSE; |
| 5845 |
|
|
| 5846 |
/* Set up a starting, non-extracting bracket, then compile the expression. On |
/* Set up a starting, non-extracting bracket, then compile the expression. On |
| 5847 |
error, errorcode will be set non-zero, so we don't need to look at the result |
error, errorcode will be set non-zero, so we don't need to look at the result |
| 5856 |
re->top_backref = cd->top_backref; |
re->top_backref = cd->top_backref; |
| 5857 |
|
|
| 5858 |
if (cd->nopartial) re->options |= PCRE_NOPARTIAL; |
if (cd->nopartial) re->options |= PCRE_NOPARTIAL; |
| 5859 |
|
if (cd->had_accept) reqbyte = -1; /* Must disable after (*ACCEPT) */ |
| 5860 |
|
|
| 5861 |
/* If not reached end of pattern on success, there's an excess bracket. */ |
/* If not reached end of pattern on success, there's an excess bracket. */ |
| 5862 |
|
|