| 45 |
applications. */ |
applications. */ |
| 46 |
|
|
| 47 |
|
|
| 48 |
/* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved |
/* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved |
| 49 |
the performance of his patterns greatly. I could not use it as it stood, as it |
the performance of his patterns greatly. I could not use it as it stood, as it |
| 50 |
was not thread safe, and made assumptions about pattern sizes. Also, it caused |
was not thread safe, and made assumptions about pattern sizes. Also, it caused |
| 51 |
test 7 to loop, and test 9 to crash with a segfault. |
test 7 to loop, and test 9 to crash with a segfault. |
| 52 |
|
|
| 53 |
The issue is the check for duplicate states, which is done by a simple linear |
The issue is the check for duplicate states, which is done by a simple linear |
| 68 |
of internal_dfa_exec(). (The supplied patch used a static vector, initialized |
of internal_dfa_exec(). (The supplied patch used a static vector, initialized |
| 69 |
only once - I suspect this was the cause of the problems with the tests.) |
only once - I suspect this was the cause of the problems with the tests.) |
| 70 |
|
|
| 71 |
Overall, I concluded that the gains in some cases did not outweigh the losses |
Overall, I concluded that the gains in some cases did not outweigh the losses |
| 72 |
in others, so I abandoned this code. */ |
in others, so I abandoned this code. */ |
| 73 |
|
|
| 74 |
|
|
| 109 |
character that is to be tested in some way. This makes is possible to |
character that is to be tested in some way. This makes is possible to |
| 110 |
centralize the loading of these characters. In the case of Type * etc, the |
centralize the loading of these characters. In the case of Type * etc, the |
| 111 |
"character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a |
"character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a |
| 112 |
small value. ***NOTE*** If the start of this table is modified, the two tables |
small value. Non-zero values in the table are the offsets from the opcode where |
| 113 |
that follow must also be modified. */ |
the character is to be found. ***NOTE*** If the start of this table is |
| 114 |
|
modified, the three tables that follow must also be modified. */ |
| 115 |
|
|
| 116 |
static const uschar coptable[] = { |
static const uschar coptable[] = { |
| 117 |
0, /* End */ |
0, /* End */ |
| 161 |
0, /* DEF */ |
0, /* DEF */ |
| 162 |
0, 0, /* BRAZERO, BRAMINZERO */ |
0, 0, /* BRAZERO, BRAMINZERO */ |
| 163 |
0, 0, 0, 0, /* PRUNE, SKIP, THEN, COMMIT */ |
0, 0, 0, 0, /* PRUNE, SKIP, THEN, COMMIT */ |
| 164 |
0, 0, 0 /* FAIL, ACCEPT, SKIPZERO */ |
0, 0, 0, 0 /* FAIL, ACCEPT, CLOSE, SKIPZERO */ |
| 165 |
|
}; |
| 166 |
|
|
| 167 |
|
/* This table identifies those opcodes that inspect a character. It is used to |
| 168 |
|
remember the fact that a character could have been inspected when the end of |
| 169 |
|
the subject is reached, in order to support PCRE_PARTIAL_HARD behaviour. |
| 170 |
|
***NOTE*** If the start of this table is modified, the two tables that follow |
| 171 |
|
must also be modified. */ |
| 172 |
|
|
| 173 |
|
static const uschar poptable[] = { |
| 174 |
|
0, /* End */ |
| 175 |
|
0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */ |
| 176 |
|
1, 1, 1, 1, 1, 1, /* \D, \d, \S, \s, \W, \w */ |
| 177 |
|
1, 1, 1, /* Any, AllAny, Anybyte */ |
| 178 |
|
1, 1, 1, /* NOTPROP, PROP, EXTUNI */ |
| 179 |
|
1, 1, 1, 1, 1, /* \R, \H, \h, \V, \v */ |
| 180 |
|
0, 0, 0, 0, 0, /* \Z, \z, Opt, ^, $ */ |
| 181 |
|
1, /* Char */ |
| 182 |
|
1, /* Charnc */ |
| 183 |
|
1, /* not */ |
| 184 |
|
/* Positive single-char repeats */ |
| 185 |
|
1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */ |
| 186 |
|
1, 1, 1, /* upto, minupto, exact */ |
| 187 |
|
1, 1, 1, 1, /* *+, ++, ?+, upto+ */ |
| 188 |
|
/* Negative single-char repeats - only for chars < 256 */ |
| 189 |
|
1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */ |
| 190 |
|
1, 1, 1, /* NOT upto, minupto, exact */ |
| 191 |
|
1, 1, 1, 1, /* NOT *+, ++, ?+, upto+ */ |
| 192 |
|
/* Positive type repeats */ |
| 193 |
|
1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */ |
| 194 |
|
1, 1, 1, /* Type upto, minupto, exact */ |
| 195 |
|
1, 1, 1, 1, /* Type *+, ++, ?+, upto+ */ |
| 196 |
|
/* Character class & ref repeats */ |
| 197 |
|
1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */ |
| 198 |
|
1, 1, /* CRRANGE, CRMINRANGE */ |
| 199 |
|
1, /* CLASS */ |
| 200 |
|
1, /* NCLASS */ |
| 201 |
|
1, /* XCLASS - variable length */ |
| 202 |
|
0, /* REF */ |
| 203 |
|
0, /* RECURSE */ |
| 204 |
|
0, /* CALLOUT */ |
| 205 |
|
0, /* Alt */ |
| 206 |
|
0, /* Ket */ |
| 207 |
|
0, /* KetRmax */ |
| 208 |
|
0, /* KetRmin */ |
| 209 |
|
0, /* Assert */ |
| 210 |
|
0, /* Assert not */ |
| 211 |
|
0, /* Assert behind */ |
| 212 |
|
0, /* Assert behind not */ |
| 213 |
|
0, /* Reverse */ |
| 214 |
|
0, 0, 0, 0, /* ONCE, BRA, CBRA, COND */ |
| 215 |
|
0, 0, 0, /* SBRA, SCBRA, SCOND */ |
| 216 |
|
0, /* CREF */ |
| 217 |
|
0, /* RREF */ |
| 218 |
|
0, /* DEF */ |
| 219 |
|
0, 0, /* BRAZERO, BRAMINZERO */ |
| 220 |
|
0, 0, 0, 0, /* PRUNE, SKIP, THEN, COMMIT */ |
| 221 |
|
0, 0, 0, 0 /* FAIL, ACCEPT, CLOSE, SKIPZERO */ |
| 222 |
}; |
}; |
| 223 |
|
|
| 224 |
/* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W, |
/* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W, |
| 475 |
current_subject - start_subject : max_back; |
current_subject - start_subject : max_back; |
| 476 |
current_subject -= gone_back; |
current_subject -= gone_back; |
| 477 |
} |
} |
| 478 |
|
|
| 479 |
/* Save the earliest consulted character */ |
/* Save the earliest consulted character */ |
| 480 |
|
|
| 481 |
if (current_subject < md->start_used_ptr) |
if (current_subject < md->start_used_ptr) |
| 482 |
md->start_used_ptr = current_subject; |
md->start_used_ptr = current_subject; |
| 483 |
|
|
| 484 |
/* Now we can process the individual branches. */ |
/* Now we can process the individual branches. */ |
| 485 |
|
|
| 546 |
int clen, dlen; |
int clen, dlen; |
| 547 |
unsigned int c, d; |
unsigned int c, d; |
| 548 |
int forced_fail = 0; |
int forced_fail = 0; |
| 549 |
int reached_end = 0; |
int reached_end = 0; |
| 550 |
|
BOOL could_continue = FALSE; |
| 551 |
|
|
| 552 |
/* Make the new state list into the active state list and empty the |
/* Make the new state list into the active state list and empty the |
| 553 |
new state list. */ |
new state list. */ |
| 637 |
} |
} |
| 638 |
} |
} |
| 639 |
|
|
| 640 |
/* Check for a duplicate state with the same count, and skip if found. |
/* Check for a duplicate state with the same count, and skip if found. |
| 641 |
See the note at the head of this module about the possibility of improving |
See the note at the head of this module about the possibility of improving |
| 642 |
performance here. */ |
performance here. */ |
| 643 |
|
|
| 655 |
|
|
| 656 |
code = start_code + state_offset; |
code = start_code + state_offset; |
| 657 |
codevalue = *code; |
codevalue = *code; |
| 658 |
|
|
| 659 |
|
/* If this opcode inspects a character, but we are at the end of the |
| 660 |
|
subject, remember the fact so that we can support PCRE_PARTIAL_HARD. */ |
| 661 |
|
|
| 662 |
|
if (clen == 0 && poptable[codevalue] != 0) |
| 663 |
|
could_continue = TRUE; |
| 664 |
|
|
| 665 |
/* If this opcode is followed by an inline character, load it. It is |
/* If this opcode is followed by an inline character, load it. It is |
| 666 |
tempting to test for the presence of a subject character here, but that |
tempting to test for the presence of a subject character here, but that |
| 712 |
/* ========================================================================== */ |
/* ========================================================================== */ |
| 713 |
/* Reached a closing bracket. If not at the end of the pattern, carry |
/* Reached a closing bracket. If not at the end of the pattern, carry |
| 714 |
on with the next opcode. Otherwise, unless we have an empty string and |
on with the next opcode. Otherwise, unless we have an empty string and |
| 715 |
PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the |
PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the |
| 716 |
start of the subject, save the match data, shifting up all previous |
start of the subject, save the match data, shifting up all previous |
| 717 |
matches so we always have the longest first. */ |
matches so we always have the longest first. */ |
| 718 |
|
|
| 727 |
ADD_ACTIVE(state_offset - GET(code, 1), 0); |
ADD_ACTIVE(state_offset - GET(code, 1), 0); |
| 728 |
} |
} |
| 729 |
} |
} |
| 730 |
else |
else |
| 731 |
{ |
{ |
| 732 |
reached_end++; /* Count branches that reach the end */ |
reached_end++; /* Count branches that reach the end */ |
| 733 |
if (ptr > current_subject || |
if (ptr > current_subject || |
| 734 |
((md->moptions & PCRE_NOTEMPTY) == 0 && |
((md->moptions & PCRE_NOTEMPTY) == 0 && |
| 735 |
((md->moptions & PCRE_NOTEMPTY_ATSTART) == 0 || |
((md->moptions & PCRE_NOTEMPTY_ATSTART) == 0 || |
| 736 |
current_subject > start_subject + md->start_offset))) |
current_subject > start_subject + md->start_offset))) |
| 754 |
match_count, rlevel*2-2, SP)); |
match_count, rlevel*2-2, SP)); |
| 755 |
return match_count; |
return match_count; |
| 756 |
} |
} |
| 757 |
} |
} |
| 758 |
} |
} |
| 759 |
break; |
break; |
| 760 |
|
|
| 904 |
if (ptr > start_subject) |
if (ptr > start_subject) |
| 905 |
{ |
{ |
| 906 |
const uschar *temp = ptr - 1; |
const uschar *temp = ptr - 1; |
| 907 |
if (temp < md->start_used_ptr) md->start_used_ptr = temp; |
if (temp < md->start_used_ptr) md->start_used_ptr = temp; |
| 908 |
#ifdef SUPPORT_UTF8 |
#ifdef SUPPORT_UTF8 |
| 909 |
if (utf8) BACKCHAR(temp); |
if (utf8) BACKCHAR(temp); |
| 910 |
#endif |
#endif |
| 913 |
} |
} |
| 914 |
else left_word = 0; |
else left_word = 0; |
| 915 |
|
|
| 916 |
if (clen > 0) |
if (clen > 0) |
| 917 |
right_word = c < 256 && (ctypes[c] & ctype_word) != 0; |
right_word = c < 256 && (ctypes[c] & ctype_word) != 0; |
| 918 |
else /* This is a fudge to ensure that if this is the */ |
else /* This is a fudge to ensure that if this is the */ |
| 919 |
{ /* last item in the pattern, we don't count it as */ |
{ /* last item in the pattern, we don't count it as */ |
| 920 |
reached_end--; /* reached, thus disabling a partial match. */ |
reached_end--; /* reached, thus disabling a partial match. */ |
| 921 |
right_word = 0; |
right_word = 0; |
| 922 |
} |
} |
| 923 |
|
|
| 924 |
if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY)) |
if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY)) |
| 925 |
{ ADD_ACTIVE(state_offset + 1, 0); } |
{ ADD_ACTIVE(state_offset + 1, 0); } |
| 2352 |
|
|
| 2353 |
/* Back reference conditions are not supported */ |
/* Back reference conditions are not supported */ |
| 2354 |
|
|
| 2355 |
if (condcode == OP_CREF) return PCRE_ERROR_DFA_UCOND; |
if (condcode == OP_CREF || condcode == OP_NCREF) |
| 2356 |
|
return PCRE_ERROR_DFA_UCOND; |
| 2357 |
|
|
| 2358 |
/* The DEFINE condition is always false */ |
/* The DEFINE condition is always false */ |
| 2359 |
|
|
| 2364 |
which means "test if in any recursion". We can't test for specifically |
which means "test if in any recursion". We can't test for specifically |
| 2365 |
recursed groups. */ |
recursed groups. */ |
| 2366 |
|
|
| 2367 |
else if (condcode == OP_RREF) |
else if (condcode == OP_RREF || condcode == OP_NRREF) |
| 2368 |
{ |
{ |
| 2369 |
int value = GET2(code, LINK_SIZE+2); |
int value = GET2(code, LINK_SIZE+2); |
| 2370 |
if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND; |
if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND; |
| 2587 |
/* We have finished the processing at the current subject character. If no |
/* We have finished the processing at the current subject character. If no |
| 2588 |
new states have been set for the next character, we have found all the |
new states have been set for the next character, we have found all the |
| 2589 |
matches that we are going to find. If we are at the top level and partial |
matches that we are going to find. If we are at the top level and partial |
| 2590 |
matching has been requested, check for appropriate conditions. The "forced_ |
matching has been requested, check for appropriate conditions. |
| 2591 |
fail" variable counts the number of (*F) encountered for the character. If it |
|
| 2592 |
is equal to the original active_count (saved in workspace[1]) it means that |
The "forced_ fail" variable counts the number of (*F) encountered for the |
| 2593 |
(*F) was found on every active state. In this case we don't want to give a |
character. If it is equal to the original active_count (saved in |
| 2594 |
partial match. */ |
workspace[1]) it means that (*F) was found on every active state. In this |
| 2595 |
|
case we don't want to give a partial match. |
| 2596 |
|
|
| 2597 |
|
The "reached_end" variable counts the number of threads that have reached the |
| 2598 |
|
end of the pattern. The "could_continue" variable is true if a thread could |
| 2599 |
|
have continued but for the fact that the end of the subject was reached. */ |
| 2600 |
|
|
| 2601 |
if (new_count <= 0) |
if (new_count <= 0) |
| 2602 |
{ |
{ |
| 2603 |
if (rlevel == 1 && /* Top level, and */ |
if (rlevel == 1 && /* Top level, and */ |
| 2604 |
reached_end != workspace[1] && /* Not all reached end */ |
( /* either... */ |
| 2605 |
|
reached_end != workspace[1] || /* Not all reached end */ |
| 2606 |
|
could_continue /* or some could go on */ |
| 2607 |
|
) && /* and... */ |
| 2608 |
forced_fail != workspace[1] && /* Not all forced fail & */ |
forced_fail != workspace[1] && /* Not all forced fail & */ |
| 2609 |
( /* either... */ |
( /* either... */ |
| 2610 |
(md->moptions & PCRE_PARTIAL_HARD) != 0 /* Hard partial */ |
(md->moptions & PCRE_PARTIAL_HARD) != 0 /* Hard partial */ |
| 2725 |
if ((flags & PCRE_EXTRA_TABLES) != 0) |
if ((flags & PCRE_EXTRA_TABLES) != 0) |
| 2726 |
md->tables = extra_data->tables; |
md->tables = extra_data->tables; |
| 2727 |
} |
} |
| 2728 |
|
|
| 2729 |
/* Check that the first field in the block is the magic number. If it is not, |
/* Check that the first field in the block is the magic number. If it is not, |
| 2730 |
test for a regex that was compiled on a host of opposite endianness. If this is |
test for a regex that was compiled on a host of opposite endianness. If this is |
| 2731 |
the case, flipped values are put in internal_re and internal_study if there was |
the case, flipped values are put in internal_re and internal_study if there was |
| 2987 |
|
|
| 2988 |
end_subject = save_end_subject; |
end_subject = save_end_subject; |
| 2989 |
|
|
| 2990 |
/* The following two optimizations are disabled for partial matching or if |
/* The following two optimizations are disabled for partial matching or if |
| 2991 |
disabling is explicitly requested (and of course, by the test above, this |
disabling is explicitly requested (and of course, by the test above, this |
| 2992 |
code is not obeyed when restarting after a partial match). */ |
code is not obeyed when restarting after a partial match). */ |
| 2993 |
|
|
| 2994 |
if ((options & PCRE_NO_START_OPTIMIZE) == 0 && |
if ((options & PCRE_NO_START_OPTIMIZE) == 0 && |
| 2995 |
(options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) == 0) |
(options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) == 0) |
| 2996 |
{ |
{ |
| 2997 |
/* If the pattern was studied, a minimum subject length may be set. This |
/* If the pattern was studied, a minimum subject length may be set. This |
| 2998 |
is a lower bound; no actual string of that length may actually match the |
is a lower bound; no actual string of that length may actually match the |
| 2999 |
pattern. Although the value is, strictly, in characters, we treat it as |
pattern. Although the value is, strictly, in characters, we treat it as |
| 3002 |
if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 && |
if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 && |
| 3003 |
end_subject - current_subject < study->minlength) |
end_subject - current_subject < study->minlength) |
| 3004 |
return PCRE_ERROR_NOMATCH; |
return PCRE_ERROR_NOMATCH; |
| 3005 |
|
|
| 3006 |
/* If req_byte is set, we know that that character must appear in the |
/* If req_byte is set, we know that that character must appear in the |
| 3007 |
subject for the match to succeed. If the first character is set, req_byte |
subject for the match to succeed. If the first character is set, req_byte |
| 3008 |
must be later in the subject; otherwise the test starts at the match |
must be later in the subject; otherwise the test starts at the match |
| 3010 |
nested unlimited repeats that aren't going to match. Writing separate |
nested unlimited repeats that aren't going to match. Writing separate |
| 3011 |
code for cased/caseless versions makes it go faster, as does using an |
code for cased/caseless versions makes it go faster, as does using an |
| 3012 |
autoincrement and backing off on a match. |
autoincrement and backing off on a match. |
| 3013 |
|
|
| 3014 |
HOWEVER: when the subject string is very, very long, searching to its end |
HOWEVER: when the subject string is very, very long, searching to its end |
| 3015 |
can take a long time, and give bad performance on quite ordinary |
can take a long time, and give bad performance on quite ordinary |
| 3016 |
patterns. This showed up when somebody was matching /^C/ on a 32-megabyte |
patterns. This showed up when somebody was matching /^C/ on a 32-megabyte |
| 3017 |
string... so we don't do this when the string is sufficiently long. */ |
string... so we don't do this when the string is sufficiently long. */ |
| 3018 |
|
|
| 3019 |
if (req_byte >= 0 && end_subject - current_subject < REQ_BYTE_MAX) |
if (req_byte >= 0 && end_subject - current_subject < REQ_BYTE_MAX) |
| 3020 |
{ |
{ |
| 3021 |
register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0); |
register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0); |
| 3022 |
|
|
| 3023 |
/* We don't need to repeat the search if we haven't yet reached the |
/* We don't need to repeat the search if we haven't yet reached the |
| 3024 |
place we found it at last time. */ |
place we found it at last time. */ |
| 3025 |
|
|
| 3026 |
if (p > req_byte_ptr) |
if (p > req_byte_ptr) |
| 3027 |
{ |
{ |
| 3028 |
if (req_byte_caseless) |
if (req_byte_caseless) |
| 3040 |
if (*p++ == req_byte) { p--; break; } |
if (*p++ == req_byte) { p--; break; } |
| 3041 |
} |
} |
| 3042 |
} |
} |
| 3043 |
|
|
| 3044 |
/* If we can't find the required character, break the matching loop, |
/* If we can't find the required character, break the matching loop, |
| 3045 |
which will cause a return or PCRE_ERROR_NOMATCH. */ |
which will cause a return or PCRE_ERROR_NOMATCH. */ |
| 3046 |
|
|
| 3047 |
if (p >= end_subject) break; |
if (p >= end_subject) break; |
| 3048 |
|
|
| 3049 |
/* If we have found the required character, save the point where we |
/* If we have found the required character, save the point where we |
| 3050 |
found it, so that we don't search again next time round the loop if |
found it, so that we don't search again next time round the loop if |
| 3051 |
the start hasn't passed this character yet. */ |
the start hasn't passed this character yet. */ |
| 3052 |
|
|
| 3053 |
req_byte_ptr = p; |
req_byte_ptr = p; |
| 3054 |
} |
} |
| 3055 |
} |
} |
| 3056 |
} |
} |
| 3057 |
} /* End of optimizations that are done when not restarting */ |
} /* End of optimizations that are done when not restarting */ |
| 3058 |
|
|
| 3059 |
/* OK, now we can do the business */ |
/* OK, now we can do the business */ |
| 3060 |
|
|
| 3061 |
md->start_used_ptr = current_subject; |
md->start_used_ptr = current_subject; |
| 3062 |
|
|
| 3063 |
rc = internal_dfa_exec( |
rc = internal_dfa_exec( |
| 3064 |
md, /* fixed match data */ |
md, /* fixed match data */ |
| 3065 |
md->start_code, /* this subexpression's code */ |
md->start_code, /* this subexpression's code */ |