| 42 |
supporting internal functions that are not used by other modules. */ |
supporting internal functions that are not used by other modules. */ |
| 43 |
|
|
| 44 |
|
|
| 45 |
|
#ifdef HAVE_CONFIG_H |
| 46 |
|
#include <config.h> |
| 47 |
|
#endif |
| 48 |
|
|
| 49 |
#define NLBLOCK cd /* Block containing newline information */ |
#define NLBLOCK cd /* Block containing newline information */ |
| 50 |
#define PSSTART start_pattern /* Field containing processed string start */ |
#define PSSTART start_pattern /* Field containing processed string start */ |
| 51 |
#define PSEND end_pattern /* Field containing processed string end */ |
#define PSEND end_pattern /* Field containing processed string end */ |
| 52 |
|
|
|
|
|
| 53 |
#include "pcre_internal.h" |
#include "pcre_internal.h" |
| 54 |
|
|
| 55 |
|
|
| 65 |
|
|
| 66 |
/* Set bit number b in the byte-array bit map a: the bit lives in byte b/8 at
position b%8. Both arguments and the whole expansion are parenthesized so that
expression arguments (for example SETBIT(map, c + 1)) select the intended byte
and shift, and so that the macro can be used safely inside a larger expression.
Note that b is evaluated twice, so it must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
| 67 |
|
|
| 68 |
|
/* Maximum length value to check against when making sure that the integer that |
| 69 |
|
holds the compiled pattern length does not overflow. We make it a bit less than |
| 70 |
|
INT_MAX to allow for adding in group terminating bytes, so that we don't have |
| 71 |
|
to check them every time. */ |
| 72 |
|
|
| 73 |
|
#define OFLOW_MAX (INT_MAX - 20) |
| 74 |
|
|
| 75 |
|
|
| 76 |
/************************************************* |
/************************************************* |
| 77 |
* Code parameters and static tables * |
* Code parameters and static tables * |
| 130 |
/* B8 */ 0, 0, 0, 0, 0, ']', '=', '-', |
/* B8 */ 0, 0, 0, 0, 0, ']', '=', '-', |
| 131 |
/* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G, |
/* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G, |
| 132 |
/* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0, |
/* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0, |
| 133 |
/* D0 */ '}', 0, 0, 0, 0, 0, 0, -ESC_P, |
/* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P, |
| 134 |
/* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0, |
/* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0, |
| 135 |
/* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X, |
/* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X, |
| 136 |
/* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0, |
/* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0, |
| 140 |
#endif |
#endif |
| 141 |
|
|
| 142 |
|
|
| 143 |
|
/* Table of special "verbs" like (*PRUNE) */ |
| 144 |
|
|
| 145 |
|
typedef struct verbitem { |
| 146 |
|
const char *name; |
| 147 |
|
int len; |
| 148 |
|
int op; |
| 149 |
|
} verbitem; |
| 150 |
|
|
| 151 |
|
static verbitem verbs[] = { |
| 152 |
|
{ "ACCEPT", 6, OP_ACCEPT }, |
| 153 |
|
{ "COMMIT", 6, OP_COMMIT }, |
| 154 |
|
{ "F", 1, OP_FAIL }, |
| 155 |
|
{ "FAIL", 4, OP_FAIL }, |
| 156 |
|
{ "PRUNE", 5, OP_PRUNE }, |
| 157 |
|
{ "SKIP", 4, OP_SKIP }, |
| 158 |
|
{ "THEN", 4, OP_THEN } |
| 159 |
|
}; |
| 160 |
|
|
| 161 |
|
static int verbcount = sizeof(verbs)/sizeof(verbitem); |
| 162 |
|
|
| 163 |
|
|
| 164 |
/* Tables of names of POSIX character classes and their lengths. The list is |
/* Tables of names of POSIX character classes and their lengths. The list is |
| 165 |
terminated by a zero length entry. The first three must be alpha, lower, upper, |
terminated by a zero length entry. The first three must be alpha, lower, upper, |
| 166 |
as this is assumed for handling case independence. */ |
as this is assumed for handling case independence. */ |
| 234 |
"missing ) after comment", |
"missing ) after comment", |
| 235 |
"parentheses nested too deeply", /** DEAD **/ |
"parentheses nested too deeply", /** DEAD **/ |
| 236 |
/* 20 */ |
/* 20 */ |
| 237 |
"regular expression too large", |
"regular expression is too large", |
| 238 |
"failed to get memory", |
"failed to get memory", |
| 239 |
"unmatched parentheses", |
"unmatched parentheses", |
| 240 |
"internal error: code overflow", |
"internal error: code overflow", |
| 270 |
"subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)", |
"subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)", |
| 271 |
"too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")", |
"too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")", |
| 272 |
/* 50 */ |
/* 50 */ |
| 273 |
"repeated subpattern is too long", |
"repeated subpattern is too long", /** DEAD **/ |
| 274 |
"octal value is greater than \\377 (not in UTF-8 mode)", |
"octal value is greater than \\377 (not in UTF-8 mode)", |
| 275 |
"internal error: overran compiling workspace", |
"internal error: overran compiling workspace", |
| 276 |
"internal error: previously-checked referenced subpattern not found", |
"internal error: previously-checked referenced subpattern not found", |
| 279 |
"repeating a DEFINE group is not allowed", |
"repeating a DEFINE group is not allowed", |
| 280 |
"inconsistent NEWLINE options", |
"inconsistent NEWLINE options", |
| 281 |
"\\g is not followed by a braced name or an optionally braced non-zero number", |
"\\g is not followed by a braced name or an optionally braced non-zero number", |
| 282 |
"(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number" |
"(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number", |
| 283 |
|
"(*VERB) with an argument is not supported", |
| 284 |
|
/* 60 */ |
| 285 |
|
"(*VERB) not recognized", |
| 286 |
|
"number is too big" |
| 287 |
}; |
}; |
| 288 |
|
|
| 289 |
|
|
| 440 |
|
|
| 441 |
Returns: zero or positive => a data character |
Returns: zero or positive => a data character |
| 442 |
negative => a special escape sequence |
negative => a special escape sequence |
| 443 |
on error, errorptr is set |
on error, errorcodeptr is set |
| 444 |
*/ |
*/ |
| 445 |
|
|
| 446 |
static int |
static int |
| 525 |
while ((digitab[ptr[1]] & ctype_digit) != 0) |
while ((digitab[ptr[1]] & ctype_digit) != 0) |
| 526 |
c = c * 10 + *(++ptr) - '0'; |
c = c * 10 + *(++ptr) - '0'; |
| 527 |
|
|
| 528 |
|
if (c < 0) |
| 529 |
|
{ |
| 530 |
|
*errorcodeptr = ERR61; |
| 531 |
|
break; |
| 532 |
|
} |
| 533 |
|
|
| 534 |
if (c == 0 || (braced && *(++ptr) != '}')) |
if (c == 0 || (braced && *(++ptr) != '}')) |
| 535 |
{ |
{ |
| 536 |
*errorcodeptr = ERR57; |
*errorcodeptr = ERR57; |
| 537 |
return 0; |
break; |
| 538 |
} |
} |
| 539 |
|
|
| 540 |
if (negated) |
if (negated) |
| 542 |
if (c > bracount) |
if (c > bracount) |
| 543 |
{ |
{ |
| 544 |
*errorcodeptr = ERR15; |
*errorcodeptr = ERR15; |
| 545 |
return 0; |
break; |
| 546 |
} |
} |
| 547 |
c = bracount - (c - 1); |
c = bracount - (c - 1); |
| 548 |
} |
} |
| 571 |
c -= '0'; |
c -= '0'; |
| 572 |
while ((digitab[ptr[1]] & ctype_digit) != 0) |
while ((digitab[ptr[1]] & ctype_digit) != 0) |
| 573 |
c = c * 10 + *(++ptr) - '0'; |
c = c * 10 + *(++ptr) - '0'; |
| 574 |
|
if (c < 0) |
| 575 |
|
{ |
| 576 |
|
*errorcodeptr = ERR61; |
| 577 |
|
break; |
| 578 |
|
} |
| 579 |
if (c < 10 || c <= bracount) |
if (c < 10 || c <= bracount) |
| 580 |
{ |
{ |
| 581 |
c = -(ESC_REF + c); |
c = -(ESC_REF + c); |
| 671 |
if (c == 0) |
if (c == 0) |
| 672 |
{ |
{ |
| 673 |
*errorcodeptr = ERR2; |
*errorcodeptr = ERR2; |
| 674 |
return 0; |
break; |
| 675 |
} |
} |
| 676 |
|
|
| 677 |
#ifndef EBCDIC /* ASCII coding */ |
#ifndef EBCDIC /* ASCII coding */ |
| 747 |
*negptr = TRUE; |
*negptr = TRUE; |
| 748 |
ptr++; |
ptr++; |
| 749 |
} |
} |
| 750 |
for (i = 0; i < sizeof(name) - 1; i++) |
for (i = 0; i < (int)sizeof(name) - 1; i++) |
| 751 |
{ |
{ |
| 752 |
c = *(++ptr); |
c = *(++ptr); |
| 753 |
if (c == 0) goto ERROR_RETURN; |
if (c == 0) goto ERROR_RETURN; |
| 950 |
{ |
{ |
| 951 |
while (*(++ptr) != ']') |
while (*(++ptr) != ']') |
| 952 |
{ |
{ |
| 953 |
|
if (*ptr == 0) return -1; |
| 954 |
if (*ptr == '\\') |
if (*ptr == '\\') |
| 955 |
{ |
{ |
| 956 |
if (*(++ptr) == 0) return -1; |
if (*(++ptr) == 0) return -1; |
| 978 |
/* An opening parens must now be a real metacharacter */ |
/* An opening parens must now be a real metacharacter */ |
| 979 |
|
|
| 980 |
if (*ptr != '(') continue; |
if (*ptr != '(') continue; |
| 981 |
if (ptr[1] != '?') |
if (ptr[1] != '?' && ptr[1] != '*') |
| 982 |
{ |
{ |
| 983 |
count++; |
count++; |
| 984 |
if (name == NULL && count == lorn) return count; |
if (name == NULL && count == lorn) return count; |
| 1106 |
{ |
{ |
| 1107 |
int d; |
int d; |
| 1108 |
register int op = *cc; |
register int op = *cc; |
|
|
|
| 1109 |
switch (op) |
switch (op) |
| 1110 |
{ |
{ |
| 1111 |
case OP_CBRA: |
case OP_CBRA: |
| 1194 |
|
|
| 1195 |
case OP_TYPEEXACT: |
case OP_TYPEEXACT: |
| 1196 |
branchlength += GET2(cc,1); |
branchlength += GET2(cc,1); |
| 1197 |
|
if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2; |
| 1198 |
cc += 4; |
cc += 4; |
| 1199 |
break; |
break; |
| 1200 |
|
|
| 1303 |
code += _pcre_OP_lengths[c]; |
code += _pcre_OP_lengths[c]; |
| 1304 |
} |
} |
| 1305 |
|
|
| 1306 |
/* In UTF-8 mode, opcodes that are followed by a character may be followed by |
/* Otherwise, we can get the item's length from the table, except that for |
| 1307 |
a multi-byte character. The length in the table is a minimum, so we have to |
repeated character types, we have to test for \p and \P, which have an extra |
| 1308 |
arrange to skip the extra bytes. */ |
two bytes of parameters. */ |
| 1309 |
|
|
| 1310 |
else |
else |
| 1311 |
{ |
{ |
| 1312 |
|
switch(c) |
| 1313 |
|
{ |
| 1314 |
|
case OP_TYPESTAR: |
| 1315 |
|
case OP_TYPEMINSTAR: |
| 1316 |
|
case OP_TYPEPLUS: |
| 1317 |
|
case OP_TYPEMINPLUS: |
| 1318 |
|
case OP_TYPEQUERY: |
| 1319 |
|
case OP_TYPEMINQUERY: |
| 1320 |
|
case OP_TYPEPOSSTAR: |
| 1321 |
|
case OP_TYPEPOSPLUS: |
| 1322 |
|
case OP_TYPEPOSQUERY: |
| 1323 |
|
if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2; |
| 1324 |
|
break; |
| 1325 |
|
|
| 1326 |
|
case OP_TYPEUPTO: |
| 1327 |
|
case OP_TYPEMINUPTO: |
| 1328 |
|
case OP_TYPEEXACT: |
| 1329 |
|
case OP_TYPEPOSUPTO: |
| 1330 |
|
if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2; |
| 1331 |
|
break; |
| 1332 |
|
} |
| 1333 |
|
|
| 1334 |
|
/* Add in the fixed length from the table */ |
| 1335 |
|
|
| 1336 |
code += _pcre_OP_lengths[c]; |
code += _pcre_OP_lengths[c]; |
| 1337 |
|
|
| 1338 |
|
/* In UTF-8 mode, opcodes that are followed by a character may be followed by |
| 1339 |
|
a multi-byte character. The length in the table is a minimum, so we have to |
| 1340 |
|
arrange to skip the extra bytes. */ |
| 1341 |
|
|
| 1342 |
#ifdef SUPPORT_UTF8 |
#ifdef SUPPORT_UTF8 |
| 1343 |
if (utf8) switch(c) |
if (utf8) switch(c) |
| 1344 |
{ |
{ |
| 1396 |
|
|
| 1397 |
if (c == OP_XCLASS) code += GET(code, 1); |
if (c == OP_XCLASS) code += GET(code, 1); |
| 1398 |
|
|
| 1399 |
/* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes |
/* Otherwise, we can get the item's length from the table, except that for |
| 1400 |
that are followed by a character may be followed by a multi-byte character. |
repeated character types, we have to test for \p and \P, which have an extra |
| 1401 |
The length in the table is a minimum, so we have to arrange to skip the extra |
two bytes of parameters. */ |
|
bytes. */ |
|
| 1402 |
|
|
| 1403 |
else |
else |
| 1404 |
{ |
{ |
| 1405 |
|
switch(c) |
| 1406 |
|
{ |
| 1407 |
|
case OP_TYPESTAR: |
| 1408 |
|
case OP_TYPEMINSTAR: |
| 1409 |
|
case OP_TYPEPLUS: |
| 1410 |
|
case OP_TYPEMINPLUS: |
| 1411 |
|
case OP_TYPEQUERY: |
| 1412 |
|
case OP_TYPEMINQUERY: |
| 1413 |
|
case OP_TYPEPOSSTAR: |
| 1414 |
|
case OP_TYPEPOSPLUS: |
| 1415 |
|
case OP_TYPEPOSQUERY: |
| 1416 |
|
if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2; |
| 1417 |
|
break; |
| 1418 |
|
|
| 1419 |
|
case OP_TYPEPOSUPTO: |
| 1420 |
|
case OP_TYPEUPTO: |
| 1421 |
|
case OP_TYPEMINUPTO: |
| 1422 |
|
case OP_TYPEEXACT: |
| 1423 |
|
if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2; |
| 1424 |
|
break; |
| 1425 |
|
} |
| 1426 |
|
|
| 1427 |
|
/* Add in the fixed length from the table */ |
| 1428 |
|
|
| 1429 |
code += _pcre_OP_lengths[c]; |
code += _pcre_OP_lengths[c]; |
| 1430 |
|
|
| 1431 |
|
/* In UTF-8 mode, opcodes that are followed by a character may be followed |
| 1432 |
|
by a multi-byte character. The length in the table is a minimum, so we have |
| 1433 |
|
to arrange to skip the extra bytes. */ |
| 1434 |
|
|
| 1435 |
#ifdef SUPPORT_UTF8 |
#ifdef SUPPORT_UTF8 |
| 1436 |
if (utf8) switch(c) |
if (utf8) switch(c) |
| 1437 |
{ |
{ |
| 1503 |
|
|
| 1504 |
/* For other groups, scan the branches. */ |
/* For other groups, scan the branches. */ |
| 1505 |
|
|
| 1506 |
if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE) |
if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND) |
| 1507 |
{ |
{ |
| 1508 |
BOOL empty_branch; |
BOOL empty_branch; |
| 1509 |
if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */ |
if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */ |
| 1527 |
|
|
| 1528 |
switch (c) |
switch (c) |
| 1529 |
{ |
{ |
| 1530 |
/* Check for quantifiers after a class */ |
/* Check for quantifiers after a class. XCLASS is used for classes that |
| 1531 |
|
cannot be represented just by a bit map. This includes negated single |
| 1532 |
|
high-valued characters. The length in _pcre_OP_lengths[] is zero; the |
| 1533 |
|
actual length is stored in the compiled code, so we must update "code" |
| 1534 |
|
here. */ |
| 1535 |
|
|
| 1536 |
#ifdef SUPPORT_UTF8 |
#ifdef SUPPORT_UTF8 |
| 1537 |
case OP_XCLASS: |
case OP_XCLASS: |
| 1538 |
ccode = code + GET(code, 1); |
ccode = code += GET(code, 1); |
| 1539 |
goto CHECK_CLASS_REPEAT; |
goto CHECK_CLASS_REPEAT; |
| 1540 |
#endif |
#endif |
| 1541 |
|
|
| 1596 |
case OP_TYPEPOSPLUS: |
case OP_TYPEPOSPLUS: |
| 1597 |
case OP_TYPEEXACT: |
case OP_TYPEEXACT: |
| 1598 |
return FALSE; |
return FALSE; |
| 1599 |
|
|
| 1600 |
|
/* These are going to continue, as they may be empty, but we have to |
| 1601 |
|
fudge the length for the \p and \P cases. */ |
| 1602 |
|
|
| 1603 |
|
case OP_TYPESTAR: |
| 1604 |
|
case OP_TYPEMINSTAR: |
| 1605 |
|
case OP_TYPEPOSSTAR: |
| 1606 |
|
case OP_TYPEQUERY: |
| 1607 |
|
case OP_TYPEMINQUERY: |
| 1608 |
|
case OP_TYPEPOSQUERY: |
| 1609 |
|
if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2; |
| 1610 |
|
break; |
| 1611 |
|
|
| 1612 |
|
/* Same for these */ |
| 1613 |
|
|
| 1614 |
|
case OP_TYPEUPTO: |
| 1615 |
|
case OP_TYPEMINUPTO: |
| 1616 |
|
case OP_TYPEPOSUPTO: |
| 1617 |
|
if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2; |
| 1618 |
|
break; |
| 1619 |
|
|
| 1620 |
/* End of branch */ |
/* End of branch */ |
| 1621 |
|
|
| 1779 |
uschar *save_hwm) |
uschar *save_hwm) |
| 1780 |
{ |
{ |
| 1781 |
uschar *ptr = group; |
uschar *ptr = group; |
| 1782 |
|
|
| 1783 |
while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL) |
while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL) |
| 1784 |
{ |
{ |
| 1785 |
int offset; |
int offset; |
| 2384 |
*/ |
*/ |
| 2385 |
|
|
| 2386 |
if (code < last_code) code = last_code; |
if (code < last_code) code = last_code; |
| 2387 |
|
|
| 2388 |
|
/* Paranoid check for integer overflow */ |
| 2389 |
|
|
| 2390 |
|
if (OFLOW_MAX - *lengthptr < code - last_code) |
| 2391 |
|
{ |
| 2392 |
|
*errorcodeptr = ERR20; |
| 2393 |
|
goto FAILED; |
| 2394 |
|
} |
| 2395 |
|
|
| 2396 |
*lengthptr += code - last_code; |
*lengthptr += code - last_code; |
| 2397 |
DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c)); |
DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c)); |
| 2398 |
|
|
| 2505 |
*ptrptr = ptr; |
*ptrptr = ptr; |
| 2506 |
if (lengthptr != NULL) |
if (lengthptr != NULL) |
| 2507 |
{ |
{ |
| 2508 |
|
if (OFLOW_MAX - *lengthptr < code - last_code) |
| 2509 |
|
{ |
| 2510 |
|
*errorcodeptr = ERR20; |
| 2511 |
|
goto FAILED; |
| 2512 |
|
} |
| 2513 |
*lengthptr += code - last_code; /* To include callout length */ |
*lengthptr += code - last_code; /* To include callout length */ |
| 2514 |
DPRINTF((">> end branch\n")); |
DPRINTF((">> end branch\n")); |
| 2515 |
} |
} |
| 2572 |
goto FAILED; |
goto FAILED; |
| 2573 |
} |
} |
| 2574 |
|
|
| 2575 |
/* If the first character is '^', set the negation flag and skip it. */ |
/* If the first character is '^', set the negation flag and skip it. Also, |
| 2576 |
|
if the first few characters (either before or after ^) are \Q\E or \E we |
| 2577 |
|
skip them too. This makes for compatibility with Perl. */ |
| 2578 |
|
|
| 2579 |
if ((c = *(++ptr)) == '^') |
negate_class = FALSE; |
| 2580 |
|
for (;;) |
| 2581 |
{ |
{ |
|
negate_class = TRUE; |
|
| 2582 |
c = *(++ptr); |
c = *(++ptr); |
| 2583 |
} |
if (c == '\\') |
| 2584 |
else |
{ |
| 2585 |
{ |
if (ptr[1] == 'E') ptr++; |
| 2586 |
negate_class = FALSE; |
else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3; |
| 2587 |
|
else break; |
| 2588 |
|
} |
| 2589 |
|
else if (!negate_class && c == '^') |
| 2590 |
|
negate_class = TRUE; |
| 2591 |
|
else break; |
| 2592 |
} |
} |
| 2593 |
|
|
| 2594 |
/* Keep a count of chars with values < 256 so that we can optimize the case |
/* Keep a count of chars with values < 256 so that we can optimize the case |
| 2729 |
of the specials, which just set a flag. The sequence \b is a special |
of the specials, which just set a flag. The sequence \b is a special |
| 2730 |
case. Inside a class (and only there) it is treated as backspace. |
case. Inside a class (and only there) it is treated as backspace. |
| 2731 |
Elsewhere it marks a word boundary. Other escapes have preset maps ready |
Elsewhere it marks a word boundary. Other escapes have preset maps ready |
| 2732 |
to or into the one we are building. We assume they have more than one |
to 'or' into the one we are building. We assume they have more than one |
| 2733 |
character in them, so set class_charcount bigger than one. */ |
character in them, so set class_charcount bigger than one. */ |
| 2734 |
|
|
| 2735 |
if (c == '\\') |
if (c == '\\') |
| 2749 |
else inescq = TRUE; |
else inescq = TRUE; |
| 2750 |
continue; |
continue; |
| 2751 |
} |
} |
| 2752 |
|
else if (-c == ESC_E) continue; /* Ignore orphan \E */ |
| 2753 |
|
|
| 2754 |
if (c < 0) |
if (c < 0) |
| 2755 |
{ |
{ |
| 3197 |
} |
} |
| 3198 |
|
|
| 3199 |
/* If class_charcount is 1, we saw precisely one character whose value is |
/* If class_charcount is 1, we saw precisely one character whose value is |
| 3200 |
less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we |
less than 256. As long as there were no characters >= 128 and there was no |
| 3201 |
can optimize the negative case only if there were no characters >= 128 |
use of \p or \P, in other words, no use of any XCLASS features, we can |
| 3202 |
because OP_NOT and the related opcodes like OP_NOTSTAR operate on |
optimize. |
| 3203 |
single-bytes only. This is an historical hangover. Maybe one day we can |
|
| 3204 |
tidy these opcodes to handle multi-byte characters. |
In UTF-8 mode, we can optimize the negative case only if there were no |
| 3205 |
|
characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR |
| 3206 |
|
operate on single-bytes only. This is an historical hangover. Maybe one day |
| 3207 |
|
we can tidy these opcodes to handle multi-byte characters. |
| 3208 |
|
|
| 3209 |
The optimization throws away the bit map. We turn the item into a |
The optimization throws away the bit map. We turn the item into a |
| 3210 |
1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note |
1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note |
| 3214 |
reqbyte, save the previous value for reinstating. */ |
reqbyte, save the previous value for reinstating. */ |
| 3215 |
|
|
| 3216 |
#ifdef SUPPORT_UTF8 |
#ifdef SUPPORT_UTF8 |
| 3217 |
if (class_charcount == 1 && |
if (class_charcount == 1 && !class_utf8 && |
| 3218 |
(!utf8 || |
(!utf8 || !negate_class || class_lastchar < 128)) |
|
(!class_utf8 && (!negate_class || class_lastchar < 128)))) |
|
|
|
|
| 3219 |
#else |
#else |
| 3220 |
if (class_charcount == 1) |
if (class_charcount == 1) |
| 3221 |
#endif |
#endif |
| 3673 |
goto FAILED; |
goto FAILED; |
| 3674 |
} |
} |
| 3675 |
|
|
|
/* This is a paranoid check to stop integer overflow later on */ |
|
|
|
|
|
if (len > MAX_DUPLENGTH) |
|
|
{ |
|
|
*errorcodeptr = ERR50; |
|
|
goto FAILED; |
|
|
} |
|
|
|
|
| 3676 |
/* If the maximum repeat count is unlimited, find the end of the bracket |
/* If the maximum repeat count is unlimited, find the end of the bracket |
| 3677 |
by scanning through from the start, and compute the offset back to it |
by scanning through from the start, and compute the offset back to it |
| 3678 |
from the current code pointer. There may be an OP_OPT setting following |
from the current code pointer. There may be an OP_OPT setting following |
| 3761 |
if (repeat_min > 1) |
if (repeat_min > 1) |
| 3762 |
{ |
{ |
| 3763 |
/* In the pre-compile phase, we don't actually do the replication. We |
/* In the pre-compile phase, we don't actually do the replication. We |
| 3764 |
just adjust the length as if we had. */ |
just adjust the length as if we had. Do some paranoid checks for |
| 3765 |
|
potential integer overflow. */ |
| 3766 |
|
|
| 3767 |
if (lengthptr != NULL) |
if (lengthptr != NULL) |
| 3768 |
*lengthptr += (repeat_min - 1)*length_prevgroup; |
{ |
| 3769 |
|
int delta = (repeat_min - 1)*length_prevgroup; |
| 3770 |
|
if ((double)(repeat_min - 1)*(double)length_prevgroup > |
| 3771 |
|
(double)INT_MAX || |
| 3772 |
|
OFLOW_MAX - *lengthptr < delta) |
| 3773 |
|
{ |
| 3774 |
|
*errorcodeptr = ERR20; |
| 3775 |
|
goto FAILED; |
| 3776 |
|
} |
| 3777 |
|
*lengthptr += delta; |
| 3778 |
|
} |
| 3779 |
|
|
| 3780 |
/* This is compiling for real */ |
/* This is compiling for real */ |
| 3781 |
|
|
| 3813 |
/* In the pre-compile phase, we don't actually do the replication. We |
/* In the pre-compile phase, we don't actually do the replication. We |
| 3814 |
just adjust the length as if we had. For each repetition we must add 1 |
just adjust the length as if we had. For each repetition we must add 1 |
| 3815 |
to the length for BRAZERO and for all but the last repetition we must |
to the length for BRAZERO and for all but the last repetition we must |
| 3816 |
add 2 + 2*LINKSIZE to allow for the nesting that occurs. */ |
add 2 + 2*LINKSIZE to allow for the nesting that occurs. Do some |
| 3817 |
|
paranoid checks to avoid integer overflow. */ |
| 3818 |
|
|
| 3819 |
if (lengthptr != NULL && repeat_max > 0) |
if (lengthptr != NULL && repeat_max > 0) |
| 3820 |
*lengthptr += repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) - |
{ |
| 3821 |
2 - 2*LINK_SIZE; /* Last one doesn't nest */ |
int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) - |
| 3822 |
|
2 - 2*LINK_SIZE; /* Last one doesn't nest */ |
| 3823 |
|
if ((double)repeat_max * |
| 3824 |
|
(double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE) |
| 3825 |
|
> (double)INT_MAX || |
| 3826 |
|
OFLOW_MAX - *lengthptr < delta) |
| 3827 |
|
{ |
| 3828 |
|
*errorcodeptr = ERR20; |
| 3829 |
|
goto FAILED; |
| 3830 |
|
} |
| 3831 |
|
*lengthptr += delta; |
| 3832 |
|
} |
| 3833 |
|
|
| 3834 |
/* This is compiling for real */ |
/* This is compiling for real */ |
| 3835 |
|
|
| 3981 |
/* ===================================================================*/ |
/* ===================================================================*/ |
| 3982 |
/* Start of nested parenthesized sub-expression, or comment or lookahead or |
/* Start of nested parenthesized sub-expression, or comment or lookahead or |
| 3983 |
lookbehind or option setting or condition or all the other extended |
lookbehind or option setting or condition or all the other extended |
| 3984 |
parenthesis forms. First deal with the specials; all are introduced by ?, |
parenthesis forms. */ |
|
and the appearance of any of them means that this is not a capturing |
|
|
group. */ |
|
| 3985 |
|
|
| 3986 |
case '(': |
case '(': |
| 3987 |
newoptions = options; |
newoptions = options; |
| 3990 |
save_hwm = cd->hwm; |
save_hwm = cd->hwm; |
| 3991 |
reset_bracount = FALSE; |
reset_bracount = FALSE; |
| 3992 |
|
|
| 3993 |
if (*(++ptr) == '?') |
/* First deal with various "verbs" that can be introduced by '*'. */ |
| 3994 |
|
|
| 3995 |
|
if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0) |
| 3996 |
|
{ |
| 3997 |
|
int i, namelen; |
| 3998 |
|
const uschar *name = ++ptr; |
| 3999 |
|
previous = NULL; |
| 4000 |
|
while ((cd->ctypes[*++ptr] & ctype_letter) != 0); |
| 4001 |
|
if (*ptr == ':') |
| 4002 |
|
{ |
| 4003 |
|
*errorcodeptr = ERR59; /* Not supported */ |
| 4004 |
|
goto FAILED; |
| 4005 |
|
} |
| 4006 |
|
if (*ptr != ')') |
| 4007 |
|
{ |
| 4008 |
|
*errorcodeptr = ERR60; |
| 4009 |
|
goto FAILED; |
| 4010 |
|
} |
| 4011 |
|
namelen = ptr - name; |
| 4012 |
|
for (i = 0; i < verbcount; i++) |
| 4013 |
|
{ |
| 4014 |
|
if (namelen == verbs[i].len && |
| 4015 |
|
strncmp((char *)name, verbs[i].name, namelen) == 0) |
| 4016 |
|
{ |
| 4017 |
|
*code = verbs[i].op; |
| 4018 |
|
if (*code++ == OP_ACCEPT) cd->had_accept = TRUE; |
| 4019 |
|
break; |
| 4020 |
|
} |
| 4021 |
|
} |
| 4022 |
|
if (i < verbcount) continue; |
| 4023 |
|
*errorcodeptr = ERR60; |
| 4024 |
|
goto FAILED; |
| 4025 |
|
} |
| 4026 |
|
|
| 4027 |
|
/* Deal with the extended parentheses; all are introduced by '?', and the |
| 4028 |
|
appearance of any of them means that this is not a capturing group. */ |
| 4029 |
|
|
| 4030 |
|
else if (*ptr == '?') |
| 4031 |
{ |
{ |
| 4032 |
int i, set, unset, namelen; |
int i, set, unset, namelen; |
| 4033 |
int *optset; |
int *optset; |
| 4269 |
|
|
| 4270 |
/* ------------------------------------------------------------ */ |
/* ------------------------------------------------------------ */ |
| 4271 |
case '!': /* Negative lookahead */ |
case '!': /* Negative lookahead */ |
|
bravalue = OP_ASSERT_NOT; |
|
| 4272 |
ptr++; |
ptr++; |
| 4273 |
|
if (*ptr == ')') /* Optimize (?!) */ |
| 4274 |
|
{ |
| 4275 |
|
*code++ = OP_FAIL; |
| 4276 |
|
previous = NULL; |
| 4277 |
|
continue; |
| 4278 |
|
} |
| 4279 |
|
bravalue = OP_ASSERT_NOT; |
| 4280 |
break; |
break; |
| 4281 |
|
|
| 4282 |
|
|
| 4825 |
goto FAILED; |
goto FAILED; |
| 4826 |
} |
} |
| 4827 |
|
|
| 4828 |
/* In the pre-compile phase, update the length by the length of the nested |
/* In the pre-compile phase, update the length by the length of the group, |
| 4829 |
group, less the brackets at either end. Then reduce the compiled code to |
less the brackets at either end. Then reduce the compiled code to just a |
| 4830 |
just the brackets so that it doesn't use much memory if it is duplicated by |
set of non-capturing brackets so that it doesn't use much memory if it is |
| 4831 |
a quantifier. */ |
duplicated by a quantifier. */ |
| 4832 |
|
|
| 4833 |
if (lengthptr != NULL) |
if (lengthptr != NULL) |
| 4834 |
{ |
{ |
| 4835 |
|
if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE) |
| 4836 |
|
{ |
| 4837 |
|
*errorcodeptr = ERR20; |
| 4838 |
|
goto FAILED; |
| 4839 |
|
} |
| 4840 |
*lengthptr += length_prevgroup - 2 - 2*LINK_SIZE; |
*lengthptr += length_prevgroup - 2 - 2*LINK_SIZE; |
| 4841 |
code++; |
*code++ = OP_BRA; |
| 4842 |
PUTINC(code, 0, 1 + LINK_SIZE); |
PUTINC(code, 0, 1 + LINK_SIZE); |
| 4843 |
*code++ = OP_KET; |
*code++ = OP_KET; |
| 4844 |
PUTINC(code, 0, 1 + LINK_SIZE); |
PUTINC(code, 0, 1 + LINK_SIZE); |
| 4845 |
|
break; /* No need to waste time with special character handling */ |
| 4846 |
} |
} |
| 4847 |
|
|
| 4848 |
/* Otherwise update the main code pointer to the end of the group. */ |
/* Otherwise update the main code pointer to the end of the group. */ |
| 4849 |
|
|
| 4850 |
else code = tempcode; |
code = tempcode; |
| 4851 |
|
|
| 4852 |
/* For a DEFINE group, required and first character settings are not |
/* For a DEFINE group, required and first character settings are not |
| 4853 |
relevant. */ |
relevant. */ |
| 5333 |
*ptrptr = ptr; |
*ptrptr = ptr; |
| 5334 |
*firstbyteptr = firstbyte; |
*firstbyteptr = firstbyte; |
| 5335 |
*reqbyteptr = reqbyte; |
*reqbyteptr = reqbyte; |
| 5336 |
if (lengthptr != NULL) *lengthptr += length; |
if (lengthptr != NULL) |
| 5337 |
|
{ |
| 5338 |
|
if (OFLOW_MAX - *lengthptr < length) |
| 5339 |
|
{ |
| 5340 |
|
*errorcodeptr = ERR20; |
| 5341 |
|
return FALSE; |
| 5342 |
|
} |
| 5343 |
|
*lengthptr += length; |
| 5344 |
|
} |
| 5345 |
return TRUE; |
return TRUE; |
| 5346 |
} |
} |
| 5347 |
|
|
| 5869 |
cd->hwm = cworkspace; |
cd->hwm = cworkspace; |
| 5870 |
cd->req_varyopt = 0; |
cd->req_varyopt = 0; |
| 5871 |
cd->nopartial = FALSE; |
cd->nopartial = FALSE; |
| 5872 |
|
cd->had_accept = FALSE; |
| 5873 |
|
|
| 5874 |
/* Set up a starting, non-extracting bracket, then compile the expression. On |
/* Set up a starting, non-extracting bracket, then compile the expression. On |
| 5875 |
error, errorcode will be set non-zero, so we don't need to look at the result |
error, errorcode will be set non-zero, so we don't need to look at the result |
| 5884 |
re->top_backref = cd->top_backref; |
re->top_backref = cd->top_backref; |
| 5885 |
|
|
| 5886 |
if (cd->nopartial) re->options |= PCRE_NOPARTIAL; |
if (cd->nopartial) re->options |= PCRE_NOPARTIAL; |
| 5887 |
|
if (cd->had_accept) reqbyte = -1; /* Must disable after (*ACCEPT) */ |
| 5888 |
|
|
| 5889 |
/* If not reached end of pattern on success, there's an excess bracket. */ |
/* If not reached end of pattern on success, there's an excess bracket. */ |
| 5890 |
|
|