| 45 |
applications. */ |
applications. */ |
| 46 |
|
|
| 47 |
|
|
| 48 |
|
/* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved |
| 49 |
|
the performance of his patterns greatly. I could not use it as it stood, as it |
| 50 |
|
was not thread safe, and made assumptions about pattern sizes. Also, it caused |
| 51 |
|
test 7 to loop, and test 9 to crash with a segfault. |
| 52 |
|
|
| 53 |
|
The issue is the check for duplicate states, which is done by a simple linear |
| 54 |
|
search up the state list. (Grep for "duplicate" below to find the code.) For |
| 55 |
|
many patterns, there will never be many states active at one time, so a simple |
| 56 |
|
linear search is fine. In patterns that have many active states, it might be a |
| 57 |
|
bottleneck. The suggested code used an indexing scheme to remember which states |
| 58 |
|
had previously been used for each character, and avoided the linear search when |
| 59 |
|
it knew there was no chance of a duplicate. This was implemented when adding |
| 60 |
|
states to the state lists. |
| 61 |
|
|
| 62 |
|
I wrote some thread-safe, not-limited code to try something similar at the time |
| 63 |
|
of checking for duplicates (instead of when adding states), using index vectors |
| 64 |
|
on the stack. It did give a 13% improvement with one specially constructed |
| 65 |
|
pattern for certain subject strings, but on other strings and on many of the |
| 66 |
|
simpler patterns in the test suite it did worse. The major problem, I think, |
| 67 |
|
was the extra time to initialize the index. This had to be done for each call |
| 68 |
|
of internal_dfa_exec(). (The supplied patch used a static vector, initialized |
| 69 |
|
only once - I suspect this was the cause of the problems with the tests.) |
| 70 |
|
|
| 71 |
|
Overall, I concluded that the gains in some cases did not outweigh the losses |
| 72 |
|
in others, so I abandoned this code. */ |
| 73 |
|
|
| 74 |
|
|
| 75 |
|
|
| 76 |
#ifdef HAVE_CONFIG_H |
#ifdef HAVE_CONFIG_H |
| 77 |
#include "config.h" |
#include "config.h" |
| 78 |
#endif |
#endif |
| 417 |
current_subject - start_subject : max_back; |
current_subject - start_subject : max_back; |
| 418 |
current_subject -= gone_back; |
current_subject -= gone_back; |
| 419 |
} |
} |
| 420 |
|
|
| 421 |
|
/* Save the earliest consulted character */ |
| 422 |
|
|
| 423 |
|
if (current_subject < md->start_used_ptr) |
| 424 |
|
md->start_used_ptr = current_subject; |
| 425 |
|
|
| 426 |
/* Now we can process the individual branches. */ |
/* Now we can process the individual branches. */ |
| 427 |
|
|
| 578 |
} |
} |
| 579 |
} |
} |
| 580 |
|
|
| 581 |
/* Check for a duplicate state with the same count, and skip if found. */ |
/* Check for a duplicate state with the same count, and skip if found. |
| 582 |
|
See the note at the head of this module about the possibility of improving |
| 583 |
|
performance here. */ |
| 584 |
|
|
| 585 |
for (j = 0; j < i; j++) |
for (j = 0; j < i; j++) |
| 586 |
{ |
{ |
| 647 |
/* ========================================================================== */ |
/* ========================================================================== */ |
| 648 |
/* Reached a closing bracket. If not at the end of the pattern, carry |
/* Reached a closing bracket. If not at the end of the pattern, carry |
| 649 |
on with the next opcode. Otherwise, unless we have an empty string and |
on with the next opcode. Otherwise, unless we have an empty string and |
| 650 |
PCRE_NOTEMPTY is set, save the match data, shifting up all previous |
PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the |
| 651 |
|
start of the subject, save the match data, shifting up all previous |
| 652 |
matches so we always have the longest first. */ |
matches so we always have the longest first. */ |
| 653 |
|
|
| 654 |
case OP_KET: |
case OP_KET: |
| 665 |
else |
else |
| 666 |
{ |
{ |
| 667 |
reached_end++; /* Count branches that reach the end */ |
reached_end++; /* Count branches that reach the end */ |
| 668 |
if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0) |
if (ptr > current_subject || |
| 669 |
|
((md->moptions & PCRE_NOTEMPTY) == 0 && |
| 670 |
|
((md->moptions & PCRE_NOTEMPTY_ATSTART) == 0 || |
| 671 |
|
current_subject > start_subject + md->start_offset))) |
| 672 |
{ |
{ |
| 673 |
if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0; |
if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0; |
| 674 |
else if (match_count > 0 && ++match_count * 2 >= offsetcount) |
else if (match_count > 0 && ++match_count * 2 >= offsetcount) |
| 839 |
if (ptr > start_subject) |
if (ptr > start_subject) |
| 840 |
{ |
{ |
| 841 |
const uschar *temp = ptr - 1; |
const uschar *temp = ptr - 1; |
| 842 |
|
if (temp < md->start_used_ptr) md->start_used_ptr = temp; |
| 843 |
#ifdef SUPPORT_UTF8 |
#ifdef SUPPORT_UTF8 |
| 844 |
if (utf8) BACKCHAR(temp); |
if (utf8) BACKCHAR(temp); |
| 845 |
#endif |
#endif |
| 2543 |
{ |
{ |
| 2544 |
if (offsetcount >= 2) |
if (offsetcount >= 2) |
| 2545 |
{ |
{ |
| 2546 |
offsets[0] = current_subject - start_subject; |
offsets[0] = md->start_used_ptr - start_subject; |
| 2547 |
offsets[1] = end_subject - start_subject; |
offsets[1] = end_subject - start_subject; |
| 2548 |
} |
} |
| 2549 |
match_count = PCRE_ERROR_PARTIAL; |
match_count = PCRE_ERROR_PARTIAL; |
| 2685 |
re->name_table_offset + re->name_count * re->name_entry_size; |
re->name_table_offset + re->name_count * re->name_entry_size; |
| 2686 |
md->start_subject = (const unsigned char *)subject; |
md->start_subject = (const unsigned char *)subject; |
| 2687 |
md->end_subject = end_subject; |
md->end_subject = end_subject; |
| 2688 |
|
md->start_offset = start_offset; |
| 2689 |
md->moptions = options; |
md->moptions = options; |
| 2690 |
md->poptions = re->options; |
md->poptions = re->options; |
| 2691 |
|
|
| 2977 |
|
|
| 2978 |
/* OK, now we can do the business */ |
/* OK, now we can do the business */ |
| 2979 |
|
|
| 2980 |
|
md->start_used_ptr = current_subject; |
| 2981 |
|
|
| 2982 |
rc = internal_dfa_exec( |
rc = internal_dfa_exec( |
| 2983 |
md, /* fixed match data */ |
md, /* fixed match data */ |
| 2984 |
md->start_code, /* this subexpression's code */ |
md->start_code, /* this subexpression's code */ |