| 410 |
"this version of PCRE is not compiled with PCRE_UCP support\0" |
"this version of PCRE is not compiled with PCRE_UCP support\0" |
| 411 |
"\\c must be followed by an ASCII character\0" |
"\\c must be followed by an ASCII character\0" |
| 412 |
"\\k is not followed by a braced, angle-bracketed, or quoted name\0" |
"\\k is not followed by a braced, angle-bracketed, or quoted name\0" |
| 413 |
|
/* 70 */ |
| 414 |
|
"internal error: unknown opcode in find_fixedlength()\0" |
| 415 |
; |
; |
| 416 |
|
|
| 417 |
/* Table to identify digits and hex digits. This is used when compiling |
/* Table to identify digits and hex digits. This is used when compiling |
| 678 |
|
|
| 679 |
case CHAR_l: |
case CHAR_l: |
| 680 |
case CHAR_L: |
case CHAR_L: |
| 681 |
|
*errorcodeptr = ERR37; |
| 682 |
|
break; |
| 683 |
|
|
| 684 |
case CHAR_u: |
case CHAR_u: |
| 685 |
|
if ((options & PCRE_JAVASCRIPT_COMPAT) != 0) |
| 686 |
|
{ |
| 687 |
|
/* In JavaScript, \u must be followed by four hexadecimal numbers. |
| 688 |
|
Otherwise it is a lowercase u letter. */ |
| 689 |
|
if ((digitab[ptr[1]] & ctype_xdigit) != 0 && (digitab[ptr[2]] & ctype_xdigit) != 0 |
| 690 |
|
&& (digitab[ptr[3]] & ctype_xdigit) != 0 && (digitab[ptr[4]] & ctype_xdigit) != 0) |
| 691 |
|
{ |
| 692 |
|
c = 0; |
| 693 |
|
for (i = 0; i < 4; ++i) |
| 694 |
|
{ |
| 695 |
|
register int cc = *(++ptr); |
| 696 |
|
#ifndef EBCDIC /* ASCII/UTF-8 coding */ |
| 697 |
|
if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */ |
| 698 |
|
c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10)); |
| 699 |
|
#else /* EBCDIC coding */ |
| 700 |
|
if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */ |
| 701 |
|
c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10)); |
| 702 |
|
#endif |
| 703 |
|
} |
| 704 |
|
} |
| 705 |
|
} |
| 706 |
|
else |
| 707 |
|
*errorcodeptr = ERR37; |
| 708 |
|
break; |
| 709 |
|
|
| 710 |
case CHAR_U: |
case CHAR_U: |
| 711 |
*errorcodeptr = ERR37; |
/* In JavaScript, \U is an uppercase U letter. */ |
| 712 |
|
if ((options & PCRE_JAVASCRIPT_COMPAT) == 0) *errorcodeptr = ERR37; |
| 713 |
break; |
break; |
| 714 |
|
|
| 715 |
/* In a character class, \g is just a literal "g". Outside a character |
/* In a character class, \g is just a literal "g". Outside a character |
| 859 |
treated as a data character. */ |
treated as a data character. */ |
| 860 |
|
|
| 861 |
case CHAR_x: |
case CHAR_x: |
| 862 |
|
if ((options & PCRE_JAVASCRIPT_COMPAT) != 0) |
| 863 |
|
{ |
| 864 |
|
/* In JavaScript, \x must be followed by two hexadecimal numbers. |
| 865 |
|
Otherwise it is a lowercase x letter. */ |
| 866 |
|
if ((digitab[ptr[1]] & ctype_xdigit) != 0 && (digitab[ptr[2]] & ctype_xdigit) != 0) |
| 867 |
|
{ |
| 868 |
|
c = 0; |
| 869 |
|
for (i = 0; i < 2; ++i) |
| 870 |
|
{ |
| 871 |
|
register int cc = *(++ptr); |
| 872 |
|
#ifndef EBCDIC /* ASCII/UTF-8 coding */ |
| 873 |
|
if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */ |
| 874 |
|
c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10)); |
| 875 |
|
#else /* EBCDIC coding */ |
| 876 |
|
if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */ |
| 877 |
|
c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10)); |
| 878 |
|
#endif |
| 879 |
|
} |
| 880 |
|
} |
| 881 |
|
break; |
| 882 |
|
} |
| 883 |
|
|
| 884 |
if (ptr[1] == CHAR_LEFT_CURLY_BRACKET) |
if (ptr[1] == CHAR_LEFT_CURLY_BRACKET) |
| 885 |
{ |
{ |
| 886 |
const uschar *pt = ptr + 2; |
const uschar *pt = ptr + 2; |
| 1528 |
|
|
| 1529 |
Returns: the fixed length, |
Returns: the fixed length, |
| 1530 |
or -1 if there is no fixed length, |
or -1 if there is no fixed length, |
| 1531 |
or -2 if \C was encountered |
or -2 if \C was encountered (in UTF-8 mode only) |
| 1532 |
or -3 if an OP_RECURSE item was encountered and atend is FALSE |
or -3 if an OP_RECURSE item was encountered and atend is FALSE |
| 1533 |
|
or -4 if an unknown opcode was encountered (internal error) |
| 1534 |
*/ |
*/ |
| 1535 |
|
|
| 1536 |
static int |
static int |
| 1554 |
/* We only need to continue for OP_CBRA (normal capturing bracket) and |
/* We only need to continue for OP_CBRA (normal capturing bracket) and |
| 1555 |
OP_BRA (normal non-capturing bracket) because the other variants of these |
OP_BRA (normal non-capturing bracket) because the other variants of these |
| 1556 |
opcodes are all concerned with unlimited repeated groups, which of course |
opcodes are all concerned with unlimited repeated groups, which of course |
| 1557 |
are not of fixed length. They will cause a -1 response from the default |
are not of fixed length. */ |
|
case of this switch. */ |
|
| 1558 |
|
|
| 1559 |
case OP_CBRA: |
case OP_CBRA: |
| 1560 |
case OP_BRA: |
case OP_BRA: |
| 1561 |
case OP_ONCE: |
case OP_ONCE: |
| 1562 |
|
case OP_ONCE_NC: |
| 1563 |
case OP_COND: |
case OP_COND: |
| 1564 |
d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), utf8, atend, cd); |
d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), utf8, atend, cd); |
| 1565 |
if (d < 0) return d; |
if (d < 0) return d; |
| 1568 |
cc += 1 + LINK_SIZE; |
cc += 1 + LINK_SIZE; |
| 1569 |
break; |
break; |
| 1570 |
|
|
| 1571 |
/* Reached end of a branch; if it's a ket it is the end of a nested |
/* Reached end of a branch; if it's a ket it is the end of a nested call. |
| 1572 |
call. If it's ALT it is an alternation in a nested call. If it is |
If it's ALT it is an alternation in a nested call. An ACCEPT is effectively |
| 1573 |
END it's the end of the outer call. All can be handled by the same code. |
an ALT. If it is END it's the end of the outer call. All can be handled by |
| 1574 |
Note that we must not include the OP_KETRxxx opcodes here, because they |
the same code. Note that we must not include the OP_KETRxxx opcodes here, |
| 1575 |
all imply an unlimited repeat. */ |
because they all imply an unlimited repeat. */ |
| 1576 |
|
|
| 1577 |
case OP_ALT: |
case OP_ALT: |
| 1578 |
case OP_KET: |
case OP_KET: |
| 1579 |
case OP_END: |
case OP_END: |
| 1580 |
|
case OP_ACCEPT: |
| 1581 |
|
case OP_ASSERT_ACCEPT: |
| 1582 |
if (length < 0) length = branchlength; |
if (length < 0) length = branchlength; |
| 1583 |
else if (length != branchlength) return -1; |
else if (length != branchlength) return -1; |
| 1584 |
if (*cc != OP_ALT) return length; |
if (*cc != OP_ALT) return length; |
| 1612 |
|
|
| 1613 |
/* Skip over things that don't match chars */ |
/* Skip over things that don't match chars */ |
| 1614 |
|
|
| 1615 |
case OP_REVERSE: |
case OP_MARK: |
| 1616 |
case OP_CREF: |
case OP_PRUNE_ARG: |
| 1617 |
case OP_NCREF: |
case OP_SKIP_ARG: |
| 1618 |
case OP_RREF: |
case OP_THEN_ARG: |
| 1619 |
case OP_NRREF: |
cc += cc[1] + _pcre_OP_lengths[*cc]; |
| 1620 |
case OP_DEF: |
break; |
| 1621 |
|
|
| 1622 |
case OP_CALLOUT: |
case OP_CALLOUT: |
|
case OP_SOD: |
|
|
case OP_SOM: |
|
|
case OP_SET_SOM: |
|
|
case OP_EOD: |
|
|
case OP_EODN: |
|
| 1623 |
case OP_CIRC: |
case OP_CIRC: |
| 1624 |
case OP_CIRCM: |
case OP_CIRCM: |
| 1625 |
|
case OP_CLOSE: |
| 1626 |
|
case OP_COMMIT: |
| 1627 |
|
case OP_CREF: |
| 1628 |
|
case OP_DEF: |
| 1629 |
case OP_DOLL: |
case OP_DOLL: |
| 1630 |
case OP_DOLLM: |
case OP_DOLLM: |
| 1631 |
|
case OP_EOD: |
| 1632 |
|
case OP_EODN: |
| 1633 |
|
case OP_FAIL: |
| 1634 |
|
case OP_NCREF: |
| 1635 |
|
case OP_NRREF: |
| 1636 |
case OP_NOT_WORD_BOUNDARY: |
case OP_NOT_WORD_BOUNDARY: |
| 1637 |
|
case OP_PRUNE: |
| 1638 |
|
case OP_REVERSE: |
| 1639 |
|
case OP_RREF: |
| 1640 |
|
case OP_SET_SOM: |
| 1641 |
|
case OP_SKIP: |
| 1642 |
|
case OP_SOD: |
| 1643 |
|
case OP_SOM: |
| 1644 |
|
case OP_THEN: |
| 1645 |
case OP_WORD_BOUNDARY: |
case OP_WORD_BOUNDARY: |
| 1646 |
cc += _pcre_OP_lengths[*cc]; |
cc += _pcre_OP_lengths[*cc]; |
| 1647 |
break; |
break; |
| 1663 |
need to skip over a multibyte character in UTF8 mode. */ |
need to skip over a multibyte character in UTF8 mode. */ |
| 1664 |
|
|
| 1665 |
case OP_EXACT: |
case OP_EXACT: |
| 1666 |
|
case OP_EXACTI: |
| 1667 |
|
case OP_NOTEXACT: |
| 1668 |
|
case OP_NOTEXACTI: |
| 1669 |
branchlength += GET2(cc,1); |
branchlength += GET2(cc,1); |
| 1670 |
cc += 4; |
cc += 4; |
| 1671 |
#ifdef SUPPORT_UTF8 |
#ifdef SUPPORT_UTF8 |
| 1686 |
cc += 2; |
cc += 2; |
| 1687 |
/* Fall through */ |
/* Fall through */ |
| 1688 |
|
|
| 1689 |
|
case OP_HSPACE: |
| 1690 |
|
case OP_VSPACE: |
| 1691 |
|
case OP_NOT_HSPACE: |
| 1692 |
|
case OP_NOT_VSPACE: |
| 1693 |
case OP_NOT_DIGIT: |
case OP_NOT_DIGIT: |
| 1694 |
case OP_DIGIT: |
case OP_DIGIT: |
| 1695 |
case OP_NOT_WHITESPACE: |
case OP_NOT_WHITESPACE: |
| 1702 |
cc++; |
cc++; |
| 1703 |
break; |
break; |
| 1704 |
|
|
| 1705 |
/* The single-byte matcher isn't allowed */ |
/* The single-byte matcher isn't allowed. This only happens in UTF-8 mode; |
| 1706 |
|
otherwise \C is coded as OP_ALLANY. */ |
| 1707 |
|
|
| 1708 |
case OP_ANYBYTE: |
case OP_ANYBYTE: |
| 1709 |
return -2; |
return -2; |
| 1722 |
|
|
| 1723 |
switch (*cc) |
switch (*cc) |
| 1724 |
{ |
{ |
| 1725 |
|
case OP_CRPLUS: |
| 1726 |
|
case OP_CRMINPLUS: |
| 1727 |
case OP_CRSTAR: |
case OP_CRSTAR: |
| 1728 |
case OP_CRMINSTAR: |
case OP_CRMINSTAR: |
| 1729 |
case OP_CRQUERY: |
case OP_CRQUERY: |
| 1744 |
|
|
| 1745 |
/* Anything else is variable length */ |
/* Anything else is variable length */ |
| 1746 |
|
|
| 1747 |
default: |
case OP_ANYNL: |
| 1748 |
|
case OP_BRAMINZERO: |
| 1749 |
|
case OP_BRAPOS: |
| 1750 |
|
case OP_BRAPOSZERO: |
| 1751 |
|
case OP_BRAZERO: |
| 1752 |
|
case OP_CBRAPOS: |
| 1753 |
|
case OP_EXTUNI: |
| 1754 |
|
case OP_KETRMAX: |
| 1755 |
|
case OP_KETRMIN: |
| 1756 |
|
case OP_KETRPOS: |
| 1757 |
|
case OP_MINPLUS: |
| 1758 |
|
case OP_MINPLUSI: |
| 1759 |
|
case OP_MINQUERY: |
| 1760 |
|
case OP_MINQUERYI: |
| 1761 |
|
case OP_MINSTAR: |
| 1762 |
|
case OP_MINSTARI: |
| 1763 |
|
case OP_MINUPTO: |
| 1764 |
|
case OP_MINUPTOI: |
| 1765 |
|
case OP_NOTMINPLUS: |
| 1766 |
|
case OP_NOTMINPLUSI: |
| 1767 |
|
case OP_NOTMINQUERY: |
| 1768 |
|
case OP_NOTMINQUERYI: |
| 1769 |
|
case OP_NOTMINSTAR: |
| 1770 |
|
case OP_NOTMINSTARI: |
| 1771 |
|
case OP_NOTMINUPTO: |
| 1772 |
|
case OP_NOTMINUPTOI: |
| 1773 |
|
case OP_NOTPLUS: |
| 1774 |
|
case OP_NOTPLUSI: |
| 1775 |
|
case OP_NOTPOSPLUS: |
| 1776 |
|
case OP_NOTPOSPLUSI: |
| 1777 |
|
case OP_NOTPOSQUERY: |
| 1778 |
|
case OP_NOTPOSQUERYI: |
| 1779 |
|
case OP_NOTPOSSTAR: |
| 1780 |
|
case OP_NOTPOSSTARI: |
| 1781 |
|
case OP_NOTPOSUPTO: |
| 1782 |
|
case OP_NOTPOSUPTOI: |
| 1783 |
|
case OP_NOTQUERY: |
| 1784 |
|
case OP_NOTQUERYI: |
| 1785 |
|
case OP_NOTSTAR: |
| 1786 |
|
case OP_NOTSTARI: |
| 1787 |
|
case OP_NOTUPTO: |
| 1788 |
|
case OP_NOTUPTOI: |
| 1789 |
|
case OP_PLUS: |
| 1790 |
|
case OP_PLUSI: |
| 1791 |
|
case OP_POSPLUS: |
| 1792 |
|
case OP_POSPLUSI: |
| 1793 |
|
case OP_POSQUERY: |
| 1794 |
|
case OP_POSQUERYI: |
| 1795 |
|
case OP_POSSTAR: |
| 1796 |
|
case OP_POSSTARI: |
| 1797 |
|
case OP_POSUPTO: |
| 1798 |
|
case OP_POSUPTOI: |
| 1799 |
|
case OP_QUERY: |
| 1800 |
|
case OP_QUERYI: |
| 1801 |
|
case OP_REF: |
| 1802 |
|
case OP_REFI: |
| 1803 |
|
case OP_SBRA: |
| 1804 |
|
case OP_SBRAPOS: |
| 1805 |
|
case OP_SCBRA: |
| 1806 |
|
case OP_SCBRAPOS: |
| 1807 |
|
case OP_SCOND: |
| 1808 |
|
case OP_SKIPZERO: |
| 1809 |
|
case OP_STAR: |
| 1810 |
|
case OP_STARI: |
| 1811 |
|
case OP_TYPEMINPLUS: |
| 1812 |
|
case OP_TYPEMINQUERY: |
| 1813 |
|
case OP_TYPEMINSTAR: |
| 1814 |
|
case OP_TYPEMINUPTO: |
| 1815 |
|
case OP_TYPEPLUS: |
| 1816 |
|
case OP_TYPEPOSPLUS: |
| 1817 |
|
case OP_TYPEPOSQUERY: |
| 1818 |
|
case OP_TYPEPOSSTAR: |
| 1819 |
|
case OP_TYPEPOSUPTO: |
| 1820 |
|
case OP_TYPEQUERY: |
| 1821 |
|
case OP_TYPESTAR: |
| 1822 |
|
case OP_TYPEUPTO: |
| 1823 |
|
case OP_UPTO: |
| 1824 |
|
case OP_UPTOI: |
| 1825 |
return -1; |
return -1; |
| 1826 |
|
|
| 1827 |
|
/* Catch unrecognized opcodes so that when new ones are added they |
| 1828 |
|
are not forgotten, as has happened in the past. */ |
| 1829 |
|
|
| 1830 |
|
default: |
| 1831 |
|
return -4; |
| 1832 |
} |
} |
| 1833 |
} |
} |
| 1834 |
/* Control never gets here */ |
/* Control never gets here */ |
| 1923 |
break; |
break; |
| 1924 |
|
|
| 1925 |
case OP_THEN_ARG: |
case OP_THEN_ARG: |
| 1926 |
code += code[1+LINK_SIZE]; |
code += code[1]; |
| 1927 |
break; |
break; |
| 1928 |
} |
} |
| 1929 |
|
|
| 2042 |
break; |
break; |
| 2043 |
|
|
| 2044 |
case OP_THEN_ARG: |
case OP_THEN_ARG: |
| 2045 |
code += code[1+LINK_SIZE]; |
code += code[1]; |
| 2046 |
break; |
break; |
| 2047 |
} |
} |
| 2048 |
|
|
| 2207 |
|
|
| 2208 |
if (c == OP_BRA || c == OP_BRAPOS || |
if (c == OP_BRA || c == OP_BRAPOS || |
| 2209 |
c == OP_CBRA || c == OP_CBRAPOS || |
c == OP_CBRA || c == OP_CBRAPOS || |
| 2210 |
c == OP_ONCE || c == OP_COND) |
c == OP_ONCE || c == OP_ONCE_NC || |
| 2211 |
|
c == OP_COND) |
| 2212 |
{ |
{ |
| 2213 |
BOOL empty_branch; |
BOOL empty_branch; |
| 2214 |
if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */ |
if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */ |
| 2380 |
break; |
break; |
| 2381 |
|
|
| 2382 |
case OP_THEN_ARG: |
case OP_THEN_ARG: |
| 2383 |
code += code[1+LINK_SIZE]; |
code += code[1]; |
| 2384 |
break; |
break; |
| 2385 |
|
|
| 2386 |
/* None of the remaining opcodes are required to match a character. */ |
/* None of the remaining opcodes are required to match a character. */ |
| 2458 |
A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not. |
A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not. |
| 2459 |
It seems that the appearance of a nested POSIX class supersedes an apparent |
It seems that the appearance of a nested POSIX class supersedes an apparent |
| 2460 |
external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or |
external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or |
| 2461 |
a digit. |
a digit. |
| 2462 |
|
|
| 2463 |
In Perl, unescaped square brackets may also appear as part of class names. For |
In Perl, unescaped square brackets may also appear as part of class names. For |
| 2464 |
example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for |
example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for |
| 2465 |
[:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not |
[:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not |
| 2466 |
seem right at all. PCRE does not allow closing square brackets in POSIX class |
seem right at all. PCRE does not allow closing square brackets in POSIX class |
| 2467 |
names. |
names. |
| 2468 |
|
|
| 2469 |
Arguments: |
Arguments: |
| 2482 |
{ |
{ |
| 2483 |
if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) |
if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) |
| 2484 |
ptr++; |
ptr++; |
| 2485 |
else if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE; |
else if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE; |
| 2486 |
else |
else |
| 2487 |
{ |
{ |
| 2488 |
if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) |
if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) |
| 3255 |
uschar utf8_char[6]; |
uschar utf8_char[6]; |
| 3256 |
#else |
#else |
| 3257 |
BOOL utf8 = FALSE; |
BOOL utf8 = FALSE; |
|
uschar *utf8_char = NULL; |
|
| 3258 |
#endif |
#endif |
| 3259 |
|
|
| 3260 |
#ifdef PCRE_DEBUG |
#ifdef PCRE_DEBUG |
| 3305 |
int subfirstbyte; |
int subfirstbyte; |
| 3306 |
int terminator; |
int terminator; |
| 3307 |
int mclength; |
int mclength; |
| 3308 |
|
int tempbracount; |
| 3309 |
uschar mcbuffer[8]; |
uschar mcbuffer[8]; |
| 3310 |
|
|
| 3311 |
/* Get next byte in the pattern */ |
/* Get next byte in the pattern */ |
| 3353 |
} |
} |
| 3354 |
|
|
| 3355 |
*lengthptr += (int)(code - last_code); |
*lengthptr += (int)(code - last_code); |
| 3356 |
DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c)); |
DPRINTF(("length=%d added %d c=%c\n", *lengthptr, (int)(code - last_code), |
| 3357 |
|
c)); |
| 3358 |
|
|
| 3359 |
/* If "previous" is set and it is not at the start of the work space, move |
/* If "previous" is set and it is not at the start of the work space, move |
| 3360 |
it back to there, in order to avoid filling up the work space. Otherwise, |
it back to there, in order to avoid filling up the work space. Otherwise, |
| 4426 |
past, but it no longer happens for non-repeated recursions. In fact, the |
past, but it no longer happens for non-repeated recursions. In fact, the |
| 4427 |
repeated ones could be re-implemented independently so as not to need this, |
repeated ones could be re-implemented independently so as not to need this, |
| 4428 |
but for the moment we rely on the code for repeating groups. */ |
but for the moment we rely on the code for repeating groups. */ |
| 4429 |
|
|
| 4430 |
if (*previous == OP_RECURSE) |
if (*previous == OP_RECURSE) |
| 4431 |
{ |
{ |
| 4432 |
memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE); |
memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE); |
| 4984 |
ONCE brackets can be converted into non-capturing brackets, as the |
ONCE brackets can be converted into non-capturing brackets, as the |
| 4985 |
behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to |
behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to |
| 4986 |
deal with possessive ONCEs specially. |
deal with possessive ONCEs specially. |
| 4987 |
|
|
| 4988 |
Otherwise, if the quantifier was possessive, we convert the BRA code to |
Otherwise, when we are doing the actual compile phase, check to see |
| 4989 |
the POS form, and the KET code to KETRPOS. (It turns out to be convenient |
whether this group is one that could match an empty string. If so, |
| 4990 |
at runtime to detect this kind of subpattern at both the start and at the |
convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so |
| 4991 |
end.) The use of special opcodes makes it possible to reduce greatly the |
that runtime checking can be done. [This check is also applied to ONCE |
| 4992 |
stack usage in pcre_exec(). If the group is preceded by OP_BRAZERO, |
groups at runtime, but in a different way.] |
| 4993 |
convert this to OP_BRAPOSZERO. Then cancel the possessive flag so that |
|
| 4994 |
the default action below, of wrapping everything inside atomic brackets, |
Then, if the quantifier was possessive and the bracket is not a |
| 4995 |
does not happen. |
conditional, we convert the BRA code to the POS form, and the KET code to |
| 4996 |
|
KETRPOS. (It turns out to be convenient at runtime to detect this kind of |
| 4997 |
Then, when we are doing the actual compile phase, check to see whether |
subpattern at both the start and at the end.) The use of special opcodes |
| 4998 |
this group is one that could match an empty string. If so, convert the |
makes it possible to reduce greatly the stack usage in pcre_exec(). If |
| 4999 |
initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so that runtime |
the group is preceded by OP_BRAZERO, convert this to OP_BRAPOSZERO. |
| 5000 |
checking can be done. [This check is also applied to ONCE groups at |
|
| 5001 |
runtime, but in a different way.] */ |
Then, if the minimum number of matches is 1 or 0, cancel the possessive |
| 5002 |
|
flag so that the default action below, of wrapping everything inside |
| 5003 |
|
atomic brackets, does not happen. When the minimum is greater than 1, |
| 5004 |
|
there will be earlier copies of the group, and so we still have to wrap |
| 5005 |
|
the whole thing. */ |
| 5006 |
|
|
| 5007 |
else |
else |
| 5008 |
{ |
{ |
| 5009 |
uschar *ketcode = code - 1 - LINK_SIZE; |
uschar *ketcode = code - 1 - LINK_SIZE; |
| 5010 |
uschar *bracode = ketcode - GET(ketcode, 1); |
uschar *bracode = ketcode - GET(ketcode, 1); |
| 5011 |
|
|
| 5012 |
if (*bracode == OP_ONCE && possessive_quantifier) *bracode = OP_BRA; |
/* Convert possessive ONCE brackets to non-capturing */ |
| 5013 |
if (*bracode == OP_ONCE) |
|
| 5014 |
|
if ((*bracode == OP_ONCE || *bracode == OP_ONCE_NC) && |
| 5015 |
|
possessive_quantifier) *bracode = OP_BRA; |
| 5016 |
|
|
| 5017 |
|
/* For non-possessive ONCE brackets, all we need to do is to |
| 5018 |
|
set the KET. */ |
| 5019 |
|
|
| 5020 |
|
if (*bracode == OP_ONCE || *bracode == OP_ONCE_NC) |
| 5021 |
*ketcode = OP_KETRMAX + repeat_type; |
*ketcode = OP_KETRMAX + repeat_type; |
| 5022 |
|
|
| 5023 |
|
/* Handle non-ONCE brackets and possessive ONCEs (which have been |
| 5024 |
|
converted to non-capturing above). */ |
| 5025 |
|
|
| 5026 |
else |
else |
| 5027 |
{ |
{ |
| 5028 |
if (possessive_quantifier) |
/* In the compile phase, check for empty string matching. */ |
| 5029 |
{ |
|
|
*bracode += 1; /* Switch to xxxPOS opcodes */ |
|
|
*ketcode = OP_KETRPOS; |
|
|
if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO; |
|
|
possessive_quantifier = FALSE; |
|
|
} |
|
|
else *ketcode = OP_KETRMAX + repeat_type; |
|
|
|
|
| 5030 |
if (lengthptr == NULL) |
if (lengthptr == NULL) |
| 5031 |
{ |
{ |
| 5032 |
uschar *scode = bracode; |
uschar *scode = bracode; |
| 5041 |
} |
} |
| 5042 |
while (*scode == OP_ALT); |
while (*scode == OP_ALT); |
| 5043 |
} |
} |
| 5044 |
|
|
| 5045 |
|
/* Handle possessive quantifiers. */ |
| 5046 |
|
|
| 5047 |
|
if (possessive_quantifier) |
| 5048 |
|
{ |
| 5049 |
|
/* For COND brackets, we wrap the whole thing in a possessively |
| 5050 |
|
repeated non-capturing bracket, because we have not invented POS |
| 5051 |
|
versions of the COND opcodes. Because we are moving code along, we |
| 5052 |
|
must ensure that any pending recursive references are updated. */ |
| 5053 |
|
|
| 5054 |
|
if (*bracode == OP_COND || *bracode == OP_SCOND) |
| 5055 |
|
{ |
| 5056 |
|
int nlen = (int)(code - bracode); |
| 5057 |
|
*code = OP_END; |
| 5058 |
|
adjust_recurse(bracode, 1 + LINK_SIZE, utf8, cd, save_hwm); |
| 5059 |
|
memmove(bracode + 1+LINK_SIZE, bracode, nlen); |
| 5060 |
|
code += 1 + LINK_SIZE; |
| 5061 |
|
nlen += 1 + LINK_SIZE; |
| 5062 |
|
*bracode = OP_BRAPOS; |
| 5063 |
|
*code++ = OP_KETRPOS; |
| 5064 |
|
PUTINC(code, 0, nlen); |
| 5065 |
|
PUT(bracode, 1, nlen); |
| 5066 |
|
} |
| 5067 |
|
|
| 5068 |
|
/* For non-COND brackets, we modify the BRA code and use KETRPOS. */ |
| 5069 |
|
|
| 5070 |
|
else |
| 5071 |
|
{ |
| 5072 |
|
*bracode += 1; /* Switch to xxxPOS opcodes */ |
| 5073 |
|
*ketcode = OP_KETRPOS; |
| 5074 |
|
} |
| 5075 |
|
|
| 5076 |
|
/* If the minimum is zero, mark it as possessive, then unset the |
| 5077 |
|
possessive flag when the minimum is 0 or 1. */ |
| 5078 |
|
|
| 5079 |
|
if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO; |
| 5080 |
|
if (repeat_min < 2) possessive_quantifier = FALSE; |
| 5081 |
|
} |
| 5082 |
|
|
| 5083 |
|
/* Non-possessive quantifier */ |
| 5084 |
|
|
| 5085 |
|
else *ketcode = OP_KETRMAX + repeat_type; |
| 5086 |
} |
} |
| 5087 |
} |
} |
| 5088 |
} |
} |
| 5109 |
notation is just syntactic sugar, taken from Sun's Java package, but the |
notation is just syntactic sugar, taken from Sun's Java package, but the |
| 5110 |
special opcodes can optimize it. |
special opcodes can optimize it. |
| 5111 |
|
|
| 5112 |
Possessively repeated subpatterns have already been handled in the code |
Some (but not all) possessively repeated subpatterns have already been |
| 5113 |
just above, so possessive_quantifier is always FALSE for them at this |
completely handled in the code just above. For them, possessive_quantifier |
| 5114 |
stage. |
is always FALSE at this stage. |
| 5115 |
|
|
| 5116 |
Note that the repeated item starts at tempcode, not at previous, which |
Note that the repeated item starts at tempcode, not at previous, which |
| 5117 |
might be the first part of a string whose (former) last char we repeated. |
might be the first part of a string whose (former) last char we repeated. |
| 5260 |
PUT2INC(code, 0, oc->number); |
PUT2INC(code, 0, oc->number); |
| 5261 |
} |
} |
| 5262 |
*code++ = (cd->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT; |
*code++ = (cd->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT; |
| 5263 |
|
|
| 5264 |
|
/* Do not set firstbyte after *ACCEPT */ |
| 5265 |
|
if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE; |
| 5266 |
} |
} |
| 5267 |
|
|
| 5268 |
/* Handle other cases with/without an argument */ |
/* Handle other cases with/without an argument */ |
| 5275 |
goto FAILED; |
goto FAILED; |
| 5276 |
} |
} |
| 5277 |
*code = verbs[i].op; |
*code = verbs[i].op; |
| 5278 |
if (*code++ == OP_THEN) |
if (*code++ == OP_THEN) cd->external_flags |= PCRE_HASTHEN; |
|
{ |
|
|
PUT(code, 0, code - bcptr->current_branch - 1); |
|
|
code += LINK_SIZE; |
|
|
} |
|
| 5279 |
} |
} |
| 5280 |
|
|
| 5281 |
else |
else |
| 5286 |
goto FAILED; |
goto FAILED; |
| 5287 |
} |
} |
| 5288 |
*code = verbs[i].op_arg; |
*code = verbs[i].op_arg; |
| 5289 |
if (*code++ == OP_THEN_ARG) |
if (*code++ == OP_THEN_ARG) cd->external_flags |= PCRE_HASTHEN; |
|
{ |
|
|
PUT(code, 0, code - bcptr->current_branch - 1); |
|
|
code += LINK_SIZE; |
|
|
} |
|
| 5290 |
*code++ = arglen; |
*code++ = arglen; |
| 5291 |
memcpy(code, arg, arglen); |
memcpy(code, arg, arglen); |
| 5292 |
code += arglen; |
code += arglen; |
| 5601 |
|
|
| 5602 |
/* ------------------------------------------------------------ */ |
/* ------------------------------------------------------------ */ |
| 5603 |
case CHAR_C: /* Callout - may be followed by digits; */ |
case CHAR_C: /* Callout - may be followed by digits; */ |
| 5604 |
previous_callout = code; /* Save for later completion */ |
previous_callout = code; /* Save for later completion */ |
| 5605 |
after_manual_callout = 1; /* Skip one item before completing */ |
after_manual_callout = 1; /* Skip one item before completing */ |
| 5606 |
*code++ = OP_CALLOUT; |
*code++ = OP_CALLOUT; |
| 5607 |
{ |
{ |
| 5608 |
int n = 0; |
int n = 0; |
| 6121 |
*code = bravalue; |
*code = bravalue; |
| 6122 |
tempcode = code; |
tempcode = code; |
| 6123 |
tempreqvary = cd->req_varyopt; /* Save value before bracket */ |
tempreqvary = cd->req_varyopt; /* Save value before bracket */ |
| 6124 |
|
tempbracount = cd->bracount; /* Save value before bracket */ |
| 6125 |
length_prevgroup = 0; /* Initialize for pre-compile phase */ |
length_prevgroup = 0; /* Initialize for pre-compile phase */ |
| 6126 |
|
|
| 6127 |
if (!compile_regex( |
if (!compile_regex( |
| 6144 |
)) |
)) |
| 6145 |
goto FAILED; |
goto FAILED; |
| 6146 |
|
|
| 6147 |
|
/* If this was an atomic group and there are no capturing groups within it, |
| 6148 |
|
generate OP_ONCE_NC instead of OP_ONCE. */ |
| 6149 |
|
|
| 6150 |
|
if (bravalue == OP_ONCE && cd->bracount <= tempbracount) |
| 6151 |
|
*code = OP_ONCE_NC; |
| 6152 |
|
|
| 6153 |
if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NOT) |
if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NOT) |
| 6154 |
cd->assert_depth -= 1; |
cd->assert_depth -= 1; |
| 6155 |
|
|
| 6156 |
/* At the end of compiling, code is still pointing to the start of the |
/* At the end of compiling, code is still pointing to the start of the |
| 6157 |
group, while tempcode has been updated to point past the end of the group |
group, while tempcode has been updated to point past the end of the group. |
| 6158 |
and any option resetting that may follow it. The pattern pointer (ptr) |
The pattern pointer (ptr) is on the bracket. |
|
is on the bracket. */ |
|
| 6159 |
|
|
| 6160 |
/* If this is a conditional bracket, check that there are no more than |
If this is a conditional bracket, check that there are no more than |
| 6161 |
two branches in the group, or just one if it's a DEFINE group. We do this |
two branches in the group, or just one if it's a DEFINE group. We do this |
| 6162 |
in the real compile phase, not in the pre-pass, where the whole group may |
in the real compile phase, not in the pre-pass, where the whole group may |
| 6163 |
not be available. */ |
not be available. */ |
| 6479 |
} |
} |
| 6480 |
else |
else |
| 6481 |
#endif |
#endif |
| 6482 |
{ |
/* In non-UTF-8 mode, we turn \C into OP_ALLANY instead of OP_ANYBYTE |
| 6483 |
|
so that it works in DFA mode and in lookbehinds. */ |
| 6484 |
|
|
| 6485 |
|
{ |
| 6486 |
previous = (-c > ESC_b && -c < ESC_Z)? code : NULL; |
previous = (-c > ESC_b && -c < ESC_Z)? code : NULL; |
| 6487 |
*code++ = -c; |
*code++ = (!utf8 && c == -ESC_C)? OP_ALLANY : -c; |
| 6488 |
} |
} |
| 6489 |
} |
} |
| 6490 |
continue; |
continue; |
| 6559 |
else firstbyte = reqbyte = REQ_NONE; |
else firstbyte = reqbyte = REQ_NONE; |
| 6560 |
} |
} |
| 6561 |
|
|
| 6562 |
/* firstbyte was previously set; we can set reqbyte only the length is |
/* firstbyte was previously set; we can set reqbyte only if the length is |
| 6563 |
1 or the matching is caseful. */ |
1 or the matching is caseful. */ |
| 6564 |
|
|
| 6565 |
else |
else |
| 6776 |
} |
} |
| 6777 |
else if (fixed_length < 0) |
else if (fixed_length < 0) |
| 6778 |
{ |
{ |
| 6779 |
*errorcodeptr = (fixed_length == -2)? ERR36 : ERR25; |
*errorcodeptr = (fixed_length == -2)? ERR36 : |
| 6780 |
|
(fixed_length == -4)? ERR70: ERR25; |
| 6781 |
*ptrptr = ptr; |
*ptrptr = ptr; |
| 6782 |
return FALSE; |
return FALSE; |
| 6783 |
} |
} |
| 6952 |
|
|
| 6953 |
/* Other brackets */ |
/* Other brackets */ |
| 6954 |
|
|
| 6955 |
else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND) |
else if (op == OP_ASSERT || op == OP_ONCE || op == OP_ONCE_NC || |
| 6956 |
|
op == OP_COND) |
| 6957 |
{ |
{ |
| 6958 |
if (!is_anchored(scode, bracket_map, backref_map)) return FALSE; |
if (!is_anchored(scode, bracket_map, backref_map)) return FALSE; |
| 6959 |
} |
} |
| 7057 |
|
|
| 7058 |
/* Other brackets */ |
/* Other brackets */ |
| 7059 |
|
|
| 7060 |
else if (op == OP_ASSERT || op == OP_ONCE) |
else if (op == OP_ASSERT || op == OP_ONCE || op == OP_ONCE_NC) |
| 7061 |
{ |
{ |
| 7062 |
if (!is_startline(scode, bracket_map, backref_map)) return FALSE; |
if (!is_startline(scode, bracket_map, backref_map)) return FALSE; |
| 7063 |
} |
} |
| 7127 |
case OP_SCBRAPOS: |
case OP_SCBRAPOS: |
| 7128 |
case OP_ASSERT: |
case OP_ASSERT: |
| 7129 |
case OP_ONCE: |
case OP_ONCE: |
| 7130 |
|
case OP_ONCE_NC: |
| 7131 |
case OP_COND: |
case OP_COND: |
| 7132 |
if ((d = find_firstassertedchar(scode, op == OP_ASSERT)) < 0) |
if ((d = find_firstassertedchar(scode, op == OP_ASSERT)) < 0) |
| 7133 |
return -1; |
return -1; |
| 7509 |
re->top_backref = cd->top_backref; |
re->top_backref = cd->top_backref; |
| 7510 |
re->flags = cd->external_flags; |
re->flags = cd->external_flags; |
| 7511 |
|
|
| 7512 |
if (cd->had_accept) reqbyte = -1; /* Must disable after (*ACCEPT) */ |
if (cd->had_accept) reqbyte = REQ_NONE; /* Must disable after (*ACCEPT) */ |
| 7513 |
|
|
| 7514 |
/* If not reached end of pattern on success, there's an excess bracket. */ |
/* If not reached end of pattern on success, there's an excess bracket. */ |
| 7515 |
|
|
| 7576 |
DPRINTF(("fixed length = %d\n", fixed_length)); |
DPRINTF(("fixed length = %d\n", fixed_length)); |
| 7577 |
if (fixed_length < 0) |
if (fixed_length < 0) |
| 7578 |
{ |
{ |
| 7579 |
errorcode = (fixed_length == -2)? ERR36 : ERR25; |
errorcode = (fixed_length == -2)? ERR36 : |
| 7580 |
|
(fixed_length == -4)? ERR70 : ERR25; |
| 7581 |
break; |
break; |
| 7582 |
} |
} |
| 7583 |
PUT(cc, 1, fixed_length); |
PUT(cc, 1, fixed_length); |