| 43 |
compatible, but it has advantages in certain applications. */ |
compatible, but it has advantages in certain applications. */ |
| 44 |
|
|
| 45 |
|
|
| 46 |
|
#define NLBLOCK md /* The block containing newline information */ |
| 47 |
#include "pcre_internal.h" |
#include "pcre_internal.h" |
| 48 |
|
|
| 49 |
|
|
| 424 |
for (;;) |
for (;;) |
| 425 |
{ |
{ |
| 426 |
int i, j; |
int i, j; |
| 427 |
int c, d, clen, dlen; |
int clen, dlen; |
| 428 |
|
unsigned int c, d; |
| 429 |
|
|
| 430 |
/* Make the new state list into the active state list and empty the |
/* Make the new state list into the active state list and empty the |
| 431 |
new state list. */ |
new state list. */ |
| 649 |
/*-----------------------------------------------------------------*/ |
/*-----------------------------------------------------------------*/ |
| 650 |
case OP_CIRC: |
case OP_CIRC: |
| 651 |
if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) || |
if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) || |
| 652 |
((ims & PCRE_MULTILINE) != 0 && ptr[-1] == NEWLINE)) |
((ims & PCRE_MULTILINE) != 0 && |
| 653 |
|
ptr >= start_subject + md->nllen && |
| 654 |
|
ptr != end_subject && |
| 655 |
|
IS_NEWLINE(ptr - md->nllen))) |
| 656 |
{ ADD_ACTIVE(state_offset + 1, 0); } |
{ ADD_ACTIVE(state_offset + 1, 0); } |
| 657 |
break; |
break; |
| 658 |
|
|
| 686 |
|
|
| 687 |
/*-----------------------------------------------------------------*/ |
/*-----------------------------------------------------------------*/ |
| 688 |
case OP_ANY: |
case OP_ANY: |
| 689 |
if (clen > 0 && (c != NEWLINE || (ims & PCRE_DOTALL) != 0)) |
if (clen > 0 && ((ims & PCRE_DOTALL) != 0 || |
| 690 |
|
ptr > end_subject - md->nllen || |
| 691 |
|
!IS_NEWLINE(ptr))) |
| 692 |
{ ADD_NEW(state_offset + 1, 0); } |
{ ADD_NEW(state_offset + 1, 0); } |
| 693 |
break; |
break; |
| 694 |
|
|
| 695 |
/*-----------------------------------------------------------------*/ |
/*-----------------------------------------------------------------*/ |
| 696 |
case OP_EODN: |
case OP_EODN: |
| 697 |
if (clen == 0 || (c == NEWLINE && ptr + 1 == end_subject)) |
if (clen == 0 || |
| 698 |
|
(ptr == end_subject - md->nllen && IS_NEWLINE(ptr))) |
| 699 |
{ ADD_ACTIVE(state_offset + 1, 0); } |
{ ADD_ACTIVE(state_offset + 1, 0); } |
| 700 |
break; |
break; |
| 701 |
|
|
| 703 |
case OP_DOLL: |
case OP_DOLL: |
| 704 |
if ((md->moptions & PCRE_NOTEOL) == 0) |
if ((md->moptions & PCRE_NOTEOL) == 0) |
| 705 |
{ |
{ |
| 706 |
if (clen == 0 || (c == NEWLINE && (ptr + 1 == end_subject || |
if (clen == 0 || |
| 707 |
(ims & PCRE_MULTILINE) != 0))) |
(ptr <= end_subject - md->nllen && IS_NEWLINE(ptr) && |
| 708 |
|
((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen) |
| 709 |
|
)) |
| 710 |
{ ADD_ACTIVE(state_offset + 1, 0); } |
{ ADD_ACTIVE(state_offset + 1, 0); } |
| 711 |
} |
} |
| 712 |
else if (c == NEWLINE && (ims & PCRE_MULTILINE) != 0) |
else if ((ims & PCRE_MULTILINE) != 0 && |
| 713 |
|
ptr <= end_subject - md->nllen && IS_NEWLINE(ptr)) |
| 714 |
{ ADD_ACTIVE(state_offset + 1, 0); } |
{ ADD_ACTIVE(state_offset + 1, 0); } |
| 715 |
break; |
break; |
| 716 |
|
|
| 822 |
{ |
{ |
| 823 |
if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || |
if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || |
| 824 |
(c < 256 && |
(c < 256 && |
| 825 |
(d != OP_ANY || c != '\n' || (ims & PCRE_DOTALL) != 0) && |
(d != OP_ANY || |
| 826 |
|
(ims & PCRE_DOTALL) != 0 || |
| 827 |
|
ptr > end_subject - md->nllen || |
| 828 |
|
!IS_NEWLINE(ptr) |
| 829 |
|
) && |
| 830 |
((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) |
((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) |
| 831 |
{ |
{ |
| 832 |
count++; |
count++; |
| 843 |
{ |
{ |
| 844 |
if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || |
if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || |
| 845 |
(c < 256 && |
(c < 256 && |
| 846 |
(d != OP_ANY || c != '\n' || (ims & PCRE_DOTALL) != 0) && |
(d != OP_ANY || |
| 847 |
|
(ims & PCRE_DOTALL) != 0 || |
| 848 |
|
ptr > end_subject - md->nllen || |
| 849 |
|
!IS_NEWLINE(ptr) |
| 850 |
|
) && |
| 851 |
((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) |
((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) |
| 852 |
{ |
{ |
| 853 |
ADD_NEW(state_offset + 2, 0); |
ADD_NEW(state_offset + 2, 0); |
| 863 |
{ |
{ |
| 864 |
if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || |
if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || |
| 865 |
(c < 256 && |
(c < 256 && |
| 866 |
(d != OP_ANY || c != '\n' || (ims & PCRE_DOTALL) != 0) && |
(d != OP_ANY || |
| 867 |
|
(ims & PCRE_DOTALL) != 0 || |
| 868 |
|
ptr > end_subject - md->nllen || |
| 869 |
|
!IS_NEWLINE(ptr) |
| 870 |
|
) && |
| 871 |
((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) |
((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) |
| 872 |
{ |
{ |
| 873 |
ADD_NEW(state_offset, 0); |
ADD_NEW(state_offset, 0); |
| 886 |
{ |
{ |
| 887 |
if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || |
if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || |
| 888 |
(c < 256 && |
(c < 256 && |
| 889 |
(d != OP_ANY || c != '\n' || (ims & PCRE_DOTALL) != 0) && |
(d != OP_ANY || |
| 890 |
|
(ims & PCRE_DOTALL) != 0 || |
| 891 |
|
ptr > end_subject - md->nllen || |
| 892 |
|
!IS_NEWLINE(ptr) |
| 893 |
|
) && |
| 894 |
((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) |
((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) |
| 895 |
{ |
{ |
| 896 |
if (++count >= GET2(code, 1)) |
if (++count >= GET2(code, 1)) |
| 1247 |
if (clen > 0) |
if (clen > 0) |
| 1248 |
{ |
{ |
| 1249 |
int otherd = -1; |
int otherd = -1; |
| 1250 |
if ((ims && PCRE_CASELESS) != 0) |
if ((ims & PCRE_CASELESS) != 0) |
| 1251 |
{ |
{ |
| 1252 |
#ifdef SUPPORT_UTF8 |
#ifdef SUPPORT_UTF8 |
| 1253 |
if (utf8 && d >= 128) |
if (utf8 && d >= 128) |
| 1274 |
if (clen > 0) |
if (clen > 0) |
| 1275 |
{ |
{ |
| 1276 |
int otherd = -1; |
int otherd = -1; |
| 1277 |
if ((ims && PCRE_CASELESS) != 0) |
if ((ims & PCRE_CASELESS) != 0) |
| 1278 |
{ |
{ |
| 1279 |
#ifdef SUPPORT_UTF8 |
#ifdef SUPPORT_UTF8 |
| 1280 |
if (utf8 && d >= 128) |
if (utf8 && d >= 128) |
| 1397 |
{ ADD_ACTIVE(next_state_offset + 5, 0); } |
{ ADD_ACTIVE(next_state_offset + 5, 0); } |
| 1398 |
if (isinclass) |
if (isinclass) |
| 1399 |
{ |
{ |
| 1400 |
if (++count >= GET2(ecode, 3)) |
int max = GET2(ecode, 3); |
| 1401 |
|
if (++count >= max && max != 0) /* Max 0 => no limit */ |
| 1402 |
{ ADD_NEW(next_state_offset + 5, 0); } |
{ ADD_NEW(next_state_offset + 5, 0); } |
| 1403 |
else |
else |
| 1404 |
{ ADD_NEW(state_offset, count); } |
{ ADD_NEW(state_offset, count); } |
| 1698 |
DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n" |
DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n" |
| 1699 |
"%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count, |
"%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count, |
| 1700 |
rlevel*2-2, SP)); |
rlevel*2-2, SP)); |
| 1701 |
return match_count; |
break; /* In effect, "return", but see the comment below */ |
| 1702 |
} |
} |
| 1703 |
|
|
| 1704 |
/* One or more states are active for the next character. */ |
/* One or more states are active for the next character. */ |
| 1706 |
ptr += clen; /* Advance to next subject character */ |
ptr += clen; /* Advance to next subject character */ |
| 1707 |
} /* Loop to move along the subject string */ |
} /* Loop to move along the subject string */ |
| 1708 |
|
|
| 1709 |
/* Control never gets here, but we must keep the compiler happy. */ |
/* Control gets here from "break" a few lines above. We do it this way because |
| 1710 |
|
if we use "return" above, we have compiler trouble. Some compilers warn if |
| 1711 |
|
there's nothing here because they think the function doesn't return a value. On |
| 1712 |
|
the other hand, if we put a dummy statement here, some more clever compilers |
| 1713 |
|
complain that it can't be reached. Sigh. */ |
| 1714 |
|
|
| 1715 |
DPRINTF(("%.*s+++ Unexpected end of internal_dfa_exec %d +++\n" |
return match_count; |
|
"%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, rlevel*2-2, SP)); |
|
|
return PCRE_ERROR_NOMATCH; |
|
| 1716 |
} |
} |
| 1717 |
|
|
| 1718 |
|
|
| 1751 |
{ |
{ |
| 1752 |
real_pcre *re = (real_pcre *)argument_re; |
real_pcre *re = (real_pcre *)argument_re; |
| 1753 |
dfa_match_data match_block; |
dfa_match_data match_block; |
| 1754 |
|
dfa_match_data *md = &match_block; |
| 1755 |
BOOL utf8, anchored, startline, firstline; |
BOOL utf8, anchored, startline, firstline; |
| 1756 |
const uschar *current_subject, *end_subject, *lcc; |
const uschar *current_subject, *end_subject, *lcc; |
| 1757 |
|
|
| 1766 |
int first_byte = -1; |
int first_byte = -1; |
| 1767 |
int req_byte = -1; |
int req_byte = -1; |
| 1768 |
int req_byte2 = -1; |
int req_byte2 = -1; |
| 1769 |
|
int newline; |
| 1770 |
|
|
| 1771 |
/* Plausibility checks */ |
/* Plausibility checks */ |
| 1772 |
|
|
| 1781 |
match block, so we must initialize them beforehand. However, the other fields |
match block, so we must initialize them beforehand. However, the other fields |
| 1782 |
in the match block must not be set until after the byte flipping. */ |
in the match block must not be set until after the byte flipping. */ |
| 1783 |
|
|
| 1784 |
match_block.tables = re->tables; |
md->tables = re->tables; |
| 1785 |
match_block.callout_data = NULL; |
md->callout_data = NULL; |
| 1786 |
|
|
| 1787 |
if (extra_data != NULL) |
if (extra_data != NULL) |
| 1788 |
{ |
{ |
| 1793 |
if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0) |
if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0) |
| 1794 |
return PCRE_ERROR_DFA_UMLIMIT; |
return PCRE_ERROR_DFA_UMLIMIT; |
| 1795 |
if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0) |
if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0) |
| 1796 |
match_block.callout_data = extra_data->callout_data; |
md->callout_data = extra_data->callout_data; |
| 1797 |
if ((flags & PCRE_EXTRA_TABLES) != 0) |
if ((flags & PCRE_EXTRA_TABLES) != 0) |
| 1798 |
match_block.tables = extra_data->tables; |
md->tables = extra_data->tables; |
| 1799 |
} |
} |
| 1800 |
|
|
| 1801 |
/* Check that the first field in the block is the magic number. If it is not, |
/* Check that the first field in the block is the magic number. If it is not, |
| 1816 |
end_subject = (const unsigned char *)subject + length; |
end_subject = (const unsigned char *)subject + length; |
| 1817 |
req_byte_ptr = current_subject - 1; |
req_byte_ptr = current_subject - 1; |
| 1818 |
|
|
| 1819 |
|
#ifdef SUPPORT_UTF8 |
| 1820 |
utf8 = (re->options & PCRE_UTF8) != 0; |
utf8 = (re->options & PCRE_UTF8) != 0; |
| 1821 |
|
#else |
| 1822 |
|
utf8 = FALSE; |
| 1823 |
|
#endif |
| 1824 |
|
|
| 1825 |
anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 || |
anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 || |
| 1826 |
(re->options & PCRE_ANCHORED) != 0; |
(re->options & PCRE_ANCHORED) != 0; |
| 1827 |
|
|
| 1828 |
/* The remaining fixed data for passing around. */ |
/* The remaining fixed data for passing around. */ |
| 1829 |
|
|
| 1830 |
match_block.start_code = (const uschar *)argument_re + |
md->start_code = (const uschar *)argument_re + |
| 1831 |
re->name_table_offset + re->name_count * re->name_entry_size; |
re->name_table_offset + re->name_count * re->name_entry_size; |
| 1832 |
match_block.start_subject = (const unsigned char *)subject; |
md->start_subject = (const unsigned char *)subject; |
| 1833 |
match_block.end_subject = end_subject; |
md->end_subject = end_subject; |
| 1834 |
match_block.moptions = options; |
md->moptions = options; |
| 1835 |
match_block.poptions = re->options; |
md->poptions = re->options; |
| 1836 |
|
|
| 1837 |
|
/* Handle different types of newline. The two bits give four cases. If nothing |
| 1838 |
|
is set at run time, whatever was used at compile time applies. */ |
| 1839 |
|
|
| 1840 |
|
switch ((((options & PCRE_NEWLINE_CRLF) == 0)? re->options : options) & |
| 1841 |
|
PCRE_NEWLINE_CRLF) |
| 1842 |
|
{ |
| 1843 |
|
default: newline = NEWLINE; break; /* Compile-time default */ |
| 1844 |
|
case PCRE_NEWLINE_CR: newline = '\r'; break; |
| 1845 |
|
case PCRE_NEWLINE_LF: newline = '\n'; break; |
| 1846 |
|
case PCRE_NEWLINE_CR+ |
| 1847 |
|
PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break; |
| 1848 |
|
} |
| 1849 |
|
|
| 1850 |
|
if (newline > 255) |
| 1851 |
|
{ |
| 1852 |
|
md->nllen = 2; |
| 1853 |
|
md->nl[0] = (newline >> 8) & 255; |
| 1854 |
|
md->nl[1] = newline & 255; |
| 1855 |
|
} |
| 1856 |
|
else |
| 1857 |
|
{ |
| 1858 |
|
md->nllen = 1; |
| 1859 |
|
md->nl[0] = newline; |
| 1860 |
|
} |
| 1861 |
|
|
| 1862 |
/* Check a UTF-8 string if required. Unfortunately there's no way of passing |
/* Check a UTF-8 string if required. Unfortunately there's no way of passing |
| 1863 |
back the character offset. */ |
back the character offset. */ |
| 1883 |
is a feature that makes it possible to save compiled regex and re-use them |
is a feature that makes it possible to save compiled regex and re-use them |
| 1884 |
in other programs later. */ |
in other programs later. */ |
| 1885 |
|
|
| 1886 |
if (match_block.tables == NULL) match_block.tables = _pcre_default_tables; |
if (md->tables == NULL) md->tables = _pcre_default_tables; |
| 1887 |
|
|
| 1888 |
/* The lower casing table and the "must be at the start of a line" flag are |
/* The lower casing table and the "must be at the start of a line" flag are |
| 1889 |
used in a loop when finding where to start. */ |
used in a loop when finding where to start. */ |
| 1890 |
|
|
| 1891 |
lcc = match_block.tables + lcc_offset; |
lcc = md->tables + lcc_offset; |
| 1892 |
startline = (re->options & PCRE_STARTLINE) != 0; |
startline = (re->options & PCRE_STARTLINE) != 0; |
| 1893 |
firstline = (re->options & PCRE_FIRSTLINE) != 0; |
firstline = (re->options & PCRE_FIRSTLINE) != 0; |
| 1894 |
|
|
| 1921 |
{ |
{ |
| 1922 |
req_byte = re->req_byte & 255; |
req_byte = re->req_byte & 255; |
| 1923 |
req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0; |
req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0; |
| 1924 |
req_byte2 = (match_block.tables + fcc_offset)[req_byte]; /* case flipped */ |
req_byte2 = (md->tables + fcc_offset)[req_byte]; /* case flipped */ |
| 1925 |
} |
} |
| 1926 |
|
|
| 1927 |
/* Call the main matching function, looping for a non-anchored regex after a |
/* Call the main matching function, looping for a non-anchored regex after a |
| 1946 |
if (firstline) |
if (firstline) |
| 1947 |
{ |
{ |
| 1948 |
const uschar *t = current_subject; |
const uschar *t = current_subject; |
| 1949 |
while (t < save_end_subject && *t != '\n') t++; |
while (t <= save_end_subject - md->nllen && !IS_NEWLINE(t)) t++; |
| 1950 |
end_subject = t; |
end_subject = t; |
| 1951 |
} |
} |
| 1952 |
|
|
| 1961 |
current_subject++; |
current_subject++; |
| 1962 |
} |
} |
| 1963 |
|
|
| 1964 |
/* Or to just after \n for a multiline match if possible */ |
/* Or to just after a linebreak for a multiline match if possible */ |
| 1965 |
|
|
| 1966 |
else if (startline) |
else if (startline) |
| 1967 |
{ |
{ |
| 1968 |
if (current_subject > match_block.start_subject + start_offset) |
if (current_subject > md->start_subject + md->nllen + |
| 1969 |
|
start_offset) |
| 1970 |
{ |
{ |
| 1971 |
while (current_subject < end_subject && current_subject[-1] != NEWLINE) |
while (current_subject <= end_subject && |
| 1972 |
|
!IS_NEWLINE(current_subject - md->nllen)) |
| 1973 |
current_subject++; |
current_subject++; |
| 1974 |
} |
} |
| 1975 |
} |
} |
| 2050 |
/* OK, now we can do the business */ |
/* OK, now we can do the business */ |
| 2051 |
|
|
| 2052 |
rc = internal_dfa_exec( |
rc = internal_dfa_exec( |
| 2053 |
&match_block, /* fixed match data */ |
md, /* fixed match data */ |
| 2054 |
match_block.start_code, /* this subexpression's code */ |
md->start_code, /* this subexpression's code */ |
| 2055 |
current_subject, /* where we currently are */ |
current_subject, /* where we currently are */ |
| 2056 |
start_offset, /* start offset in subject */ |
start_offset, /* start offset in subject */ |
| 2057 |
offsets, /* offset vector */ |
offsets, /* offset vector */ |
| 2058 |
offsetcount, /* size of same */ |
offsetcount, /* size of same */ |
| 2059 |
workspace, /* workspace vector */ |
workspace, /* workspace vector */ |
| 2060 |
wscount, /* size of same */ |
wscount, /* size of same */ |
| 2061 |
re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */ |
re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */ |
| 2062 |
0, /* function recurse level */ |
0, /* function recurse level */ |
| 2063 |
0); /* regex recurse level */ |
0); /* regex recurse level */ |
| 2064 |
|
|
| 2065 |
/* Anything other than "no match" means we are done, always; otherwise, carry |
/* Anything other than "no match" means we are done, always; otherwise, carry |
| 2066 |
on only if not anchored. */ |
on only if not anchored. */ |
| 2070 |
/* Advance to the next subject character unless we are at the end of a line |
/* Advance to the next subject character unless we are at the end of a line |
| 2071 |
and firstline is set. */ |
and firstline is set. */ |
| 2072 |
|
|
| 2073 |
if (firstline && *current_subject == NEWLINE) break; |
if (firstline && |
| 2074 |
|
current_subject <= end_subject - md->nllen && |
| 2075 |
|
IS_NEWLINE(current_subject)) break; |
| 2076 |
current_subject++; |
current_subject++; |
|
|
|
|
#ifdef SUPPORT_UTF8 |
|
| 2077 |
if (utf8) |
if (utf8) |
| 2078 |
{ |
{ |
| 2079 |
while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80) |
while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80) |
| 2080 |
current_subject++; |
current_subject++; |
| 2081 |
} |
} |
|
#endif |
|
|
|
|
| 2082 |
if (current_subject > end_subject) break; |
if (current_subject > end_subject) break; |
| 2083 |
} |
} |
| 2084 |
|
|