| 9 |
|
|
| 10 |
Written by: Philip Hazel <ph10@cam.ac.uk> |
Written by: Philip Hazel <ph10@cam.ac.uk> |
| 11 |
|
|
| 12 |
Copyright (c) 1997 University of Cambridge |
Copyright (c) 1998 University of Cambridge |
| 13 |
|
|
| 14 |
----------------------------------------------------------------------------- |
----------------------------------------------------------------------------- |
| 15 |
Permission is granted to anyone to use this software for any purpose on any |
Permission is granted to anyone to use this software for any purpose on any |
| 49 |
#include "internal.h" |
#include "internal.h" |
| 50 |
|
|
| 51 |
|
|
| 52 |
|
/* Allow compilation as C++ source code, should anybody want to do that. */ |
| 53 |
|
|
| 54 |
|
#ifdef __cplusplus |
| 55 |
|
#define class pcre_class |
| 56 |
|
#endif |
| 57 |
|
|
| 58 |
|
|
| 59 |
/* Min and max values for the common repeats; for the maxima, 0 => infinity */ |
/* Min and max values for the common repeats; for the maxima, 0 => infinity */ |
| 60 |
|
|
| 61 |
static char rep_min[] = { 0, 0, 1, 1, 0, 0 }; |
static const char rep_min[] = { 0, 0, 1, 1, 0, 0 }; |
| 62 |
static char rep_max[] = { 0, 0, 0, 0, 1, 1 }; |
static const char rep_max[] = { 0, 0, 0, 0, 1, 1 }; |
| 63 |
|
|
| 64 |
/* Text forms of OP_ values and things, for debugging */ |
/* Text forms of OP_ values and things, for debugging (not all used) */ |
| 65 |
|
|
| 66 |
#ifdef DEBUG |
#ifdef DEBUG |
| 67 |
static const char *OP_names[] = { |
static const char *OP_names[] = { |
| 72 |
"*", "*?", "+", "+?", "?", "??", "{", "{", "{", |
"*", "*?", "+", "+?", "?", "??", "{", "{", "{", |
| 73 |
"*", "*?", "+", "+?", "?", "??", "{", "{", "{", |
"*", "*?", "+", "+?", "?", "??", "{", "{", "{", |
| 74 |
"*", "*?", "+", "+?", "?", "??", "{", "{", |
"*", "*?", "+", "+?", "?", "??", "{", "{", |
| 75 |
"class", "Ref", |
"class", "negclass", "Ref", |
| 76 |
"Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", "Once", |
"Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", "Once", |
| 77 |
"Brazero", "Braminzero", "Bra" |
"Brazero", "Braminzero", "Bra" |
| 78 |
}; |
}; |
| 83 |
on. Zero means further processing is needed (for things like \x), or the escape |
on. Zero means further processing is needed (for things like \x), or the escape |
| 84 |
is invalid. */ |
is invalid. */ |
| 85 |
|
|
| 86 |
static short int escapes[] = { |
static const short int escapes[] = { |
| 87 |
0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */ |
0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */ |
| 88 |
0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */ |
0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */ |
| 89 |
'@', -ESC_A, -ESC_B, 0, -ESC_D, 0, 0, 0, /* @ - G */ |
'@', -ESC_A, -ESC_B, 0, -ESC_D, 0, 0, 0, /* @ - G */ |
| 98 |
|
|
| 99 |
/* Definition to allow mutual recursion */ |
/* Definition to allow mutual recursion */ |
| 100 |
|
|
| 101 |
static BOOL compile_regex(int, int *, uschar **, const uschar **, const char **); |
static BOOL |
| 102 |
|
compile_regex(int, int *, uschar **, const uschar **, const char **); |
| 103 |
|
|
| 104 |
/* Structure for passing "static" information around between the functions |
/* Structure for passing "static" information around between the functions |
| 105 |
doing the matching, so that they are thread-safe. */ |
doing the matching, so that they are thread-safe. */ |
| 264 |
case OP_KETRMIN: |
case OP_KETRMIN: |
| 265 |
return TRUE; |
return TRUE; |
| 266 |
|
|
| 267 |
|
/* Skip over entire bracket groups with zero lower bound */ |
| 268 |
|
|
| 269 |
|
case OP_BRAZERO: |
| 270 |
|
case OP_BRAMINZERO: |
| 271 |
|
cc++; |
| 272 |
|
/* Fall through */ |
| 273 |
|
|
| 274 |
/* Skip over assertive subpatterns */ |
/* Skip over assertive subpatterns */ |
| 275 |
|
|
| 276 |
case OP_ASSERT: |
case OP_ASSERT: |
| 285 |
case OP_EOD: |
case OP_EOD: |
| 286 |
case OP_CIRC: |
case OP_CIRC: |
| 287 |
case OP_DOLL: |
case OP_DOLL: |
|
case OP_BRAZERO: |
|
|
case OP_BRAMINZERO: |
|
| 288 |
case OP_NOT_WORD_BOUNDARY: |
case OP_NOT_WORD_BOUNDARY: |
| 289 |
case OP_WORD_BOUNDARY: |
case OP_WORD_BOUNDARY: |
| 290 |
cc++; |
cc++; |
| 319 |
/* Check a class or a back reference for a zero minimum */ |
/* Check a class or a back reference for a zero minimum */ |
| 320 |
|
|
| 321 |
case OP_CLASS: |
case OP_CLASS: |
| 322 |
|
case OP_NEGCLASS: |
| 323 |
case OP_REF: |
case OP_REF: |
| 324 |
cc += (*cc == OP_REF)? 2 : 33; |
cc += (*cc == OP_REF)? 2 : 33; |
| 325 |
|
|
| 623 |
int repeat_type, op_type; |
int repeat_type, op_type; |
| 624 |
int repeat_min, repeat_max; |
int repeat_min, repeat_max; |
| 625 |
int bravalue, length; |
int bravalue, length; |
| 626 |
|
int greedy_default, greedy_non_default; |
| 627 |
register int c; |
register int c; |
| 628 |
register uschar *code = *codeptr; |
register uschar *code = *codeptr; |
| 629 |
const uschar *ptr = *ptrptr; |
const uschar *ptr = *ptrptr; |
| 631 |
uschar *previous = NULL; |
uschar *previous = NULL; |
| 632 |
uschar class[32]; |
uschar class[32]; |
| 633 |
|
|
| 634 |
|
/* Set up the default and non-default settings for greediness */ |
| 635 |
|
|
| 636 |
|
greedy_default = ((options & PCRE_UNGREEDY) != 0); |
| 637 |
|
greedy_non_default = greedy_default ^ 1; |
| 638 |
|
|
| 639 |
/* Switch on next character until the end of the branch */ |
/* Switch on next character until the end of the branch */ |
| 640 |
|
|
| 641 |
for (;; ptr++) |
for (;; ptr++) |
| 690 |
|
|
| 691 |
case '[': |
case '[': |
| 692 |
previous = code; |
previous = code; |
|
*code++ = OP_CLASS; |
|
| 693 |
|
|
| 694 |
/* If the first character is '^', set the negation flag */ |
/* If the first character is '^', set the negation flag, and use a |
| 695 |
|
different opcode. This only matters if caseless matching is specified at |
| 696 |
|
runtime. */ |
| 697 |
|
|
| 698 |
if ((c = *(++ptr)) == '^') |
if ((c = *(++ptr)) == '^') |
| 699 |
{ |
{ |
| 700 |
negate_class = TRUE; |
negate_class = TRUE; |
| 701 |
|
*code++ = OP_NEGCLASS; |
| 702 |
c = *(++ptr); |
c = *(++ptr); |
| 703 |
} |
} |
| 704 |
else negate_class = FALSE; |
else |
| 705 |
|
{ |
| 706 |
|
negate_class = FALSE; |
| 707 |
|
*code++ = OP_CLASS; |
| 708 |
|
} |
| 709 |
|
|
| 710 |
/* Keep a count of chars so that we can optimize the case of just a single |
/* Keep a count of chars so that we can optimize the case of just a single |
| 711 |
character. */ |
character. */ |
| 913 |
goto FAILED; |
goto FAILED; |
| 914 |
} |
} |
| 915 |
|
|
| 916 |
/* If the next character is '?' this is a minimizing repeat. Advance to the |
/* If the next character is '?' this is a minimizing repeat, by default, |
| 917 |
|
but if PCRE_UNGREEDY is set, it works the other way round. Advance to the |
| 918 |
next character. */ |
next character. */ |
| 919 |
|
|
| 920 |
if (ptr[1] == '?') { repeat_type = 1; ptr++; } else repeat_type = 0; |
if (ptr[1] == '?') |
| 921 |
|
{ repeat_type = greedy_non_default; ptr++; } |
| 922 |
|
else repeat_type = greedy_default; |
| 923 |
|
|
| 924 |
/* If the maximum is zero then the minimum must also be zero; Perl allows |
/* If the maximum is zero then the minimum must also be zero; Perl allows |
| 925 |
this case, so we do too - by simply omitting the item altogether. */ |
this case, so we do too - by simply omitting the item altogether. */ |
| 1008 |
/* If the mininum is 1 and the previous item was a character string, |
/* If the mininum is 1 and the previous item was a character string, |
| 1009 |
we either have to put back the item that got cancelled if the string |
we either have to put back the item that got cancelled if the string |
| 1010 |
length was 1, or add the character back onto the end of a longer |
length was 1, or add the character back onto the end of a longer |
| 1011 |
string. For a character type nothing need be done; it will just get put |
string. For a character type nothing need be done; it will just get |
| 1012 |
back naturally. */ |
put back naturally. Note that the final character is always going to |
| 1013 |
|
get added below. */ |
| 1014 |
|
|
| 1015 |
else if (*previous == OP_CHARS) |
else if (*previous == OP_CHARS) |
| 1016 |
{ |
{ |
| 1017 |
if (code == previous) code += 2; else previous[1]++; |
if (code == previous) code += 2; else previous[1]++; |
| 1018 |
} |
} |
| 1019 |
|
|
| 1020 |
|
/* For a single negated character we also have to put back the |
| 1021 |
|
item that got cancelled. */ |
| 1022 |
|
|
| 1023 |
|
else if (*previous == OP_NOT) code++; |
| 1024 |
|
|
| 1025 |
/* If the maximum is unlimited, insert an OP_STAR. */ |
/* If the maximum is unlimited, insert an OP_STAR. */ |
| 1026 |
|
|
| 1027 |
if (repeat_max < 0) |
if (repeat_max < 0) |
| 1050 |
/* If previous was a character class or a back reference, we put the repeat |
/* If previous was a character class or a back reference, we put the repeat |
| 1051 |
stuff after it. */ |
stuff after it. */ |
| 1052 |
|
|
| 1053 |
else if (*previous == OP_CLASS || *previous == OP_REF) |
else if (*previous == OP_CLASS || *previous == OP_NEGCLASS || |
| 1054 |
|
*previous == OP_REF) |
| 1055 |
{ |
{ |
| 1056 |
if (repeat_min == 0 && repeat_max == -1) |
if (repeat_min == 0 && repeat_max == -1) |
| 1057 |
*code++ = OP_CRSTAR + repeat_type; |
*code++ = OP_CRSTAR + repeat_type; |
| 1164 |
case 'm': |
case 'm': |
| 1165 |
case 's': |
case 's': |
| 1166 |
case 'x': |
case 'x': |
| 1167 |
|
case 'U': |
| 1168 |
|
case 'X': |
| 1169 |
ptr++; |
ptr++; |
| 1170 |
while (*ptr != ')') ptr++; |
while (*ptr != ')') ptr++; |
| 1171 |
previous = NULL; |
previous = NULL; |
| 1325 |
the next state. */ |
the next state. */ |
| 1326 |
|
|
| 1327 |
previous[1] = length; |
previous[1] = length; |
| 1328 |
ptr--; |
if (length < 255) ptr--; |
| 1329 |
break; |
break; |
| 1330 |
} |
} |
| 1331 |
} /* end of big loop */ |
} /* end of big loop */ |
| 1769 |
ptr += 2; |
ptr += 2; |
| 1770 |
break; |
break; |
| 1771 |
} |
} |
| 1772 |
/* Else fall thourh */ |
/* Else fall through */ |
| 1773 |
|
|
| 1774 |
/* Else loop setting valid options until ) is met. Anything else is an |
/* Else loop setting valid options until ) is met. Anything else is an |
| 1775 |
error. */ |
error. */ |
| 1799 |
length -= spaces; /* Already counted spaces */ |
length -= spaces; /* Already counted spaces */ |
| 1800 |
continue; |
continue; |
| 1801 |
} |
} |
| 1802 |
|
else if (c == 'X') |
| 1803 |
|
{ |
| 1804 |
|
options |= PCRE_EXTRA; |
| 1805 |
|
continue; |
| 1806 |
|
} |
| 1807 |
|
else if (c == 'U') |
| 1808 |
|
{ |
| 1809 |
|
options |= PCRE_UNGREEDY; |
| 1810 |
|
continue; |
| 1811 |
|
} |
| 1812 |
else if (c == ')') break; |
else if (c == ')') break; |
| 1813 |
|
|
| 1814 |
*errorptr = ERR12; |
*errorptr = ERR12; |
| 2014 |
|
|
| 2015 |
if (re->options != 0) |
if (re->options != 0) |
| 2016 |
{ |
{ |
| 2017 |
printf("%s%s%s%s%s%s%s\n", |
printf("%s%s%s%s%s%s%s%s\n", |
| 2018 |
((re->options & PCRE_ANCHORED) != 0)? "anchored " : "", |
((re->options & PCRE_ANCHORED) != 0)? "anchored " : "", |
| 2019 |
((re->options & PCRE_CASELESS) != 0)? "caseless " : "", |
((re->options & PCRE_CASELESS) != 0)? "caseless " : "", |
| 2020 |
((re->options & PCRE_EXTENDED) != 0)? "extended " : "", |
((re->options & PCRE_EXTENDED) != 0)? "extended " : "", |
| 2021 |
((re->options & PCRE_MULTILINE) != 0)? "multiline " : "", |
((re->options & PCRE_MULTILINE) != 0)? "multiline " : "", |
| 2022 |
((re->options & PCRE_DOTALL) != 0)? "dotall " : "", |
((re->options & PCRE_DOTALL) != 0)? "dotall " : "", |
| 2023 |
((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "", |
((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "", |
| 2024 |
((re->options & PCRE_EXTRA) != 0)? "extra " : ""); |
((re->options & PCRE_EXTRA) != 0)? "extra " : "", |
| 2025 |
|
((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : ""); |
| 2026 |
} |
} |
| 2027 |
|
|
| 2028 |
if ((re->options & PCRE_FIRSTSET) != 0) |
if ((re->options & PCRE_FIRSTSET) != 0) |
| 2139 |
goto CLASS_REF_REPEAT; |
goto CLASS_REF_REPEAT; |
| 2140 |
|
|
| 2141 |
case OP_CLASS: |
case OP_CLASS: |
| 2142 |
|
case OP_NEGCLASS: |
| 2143 |
{ |
{ |
| 2144 |
int i, min, max; |
int i, min, max; |
| 2145 |
|
|
| 2146 |
code++; |
if (*code++ == OP_CLASS) printf(" ["); |
| 2147 |
printf(" ["); |
else printf(" ^["); |
| 2148 |
|
|
| 2149 |
for (i = 0; i < 256; i++) |
for (i = 0; i < 256; i++) |
| 2150 |
{ |
{ |
| 2764 |
item to see if there is repeat information following. Then obey similar |
item to see if there is repeat information following. Then obey similar |
| 2765 |
code to character type repeats - written out again for speed. If caseless |
code to character type repeats - written out again for speed. If caseless |
| 2766 |
matching was set at runtime but not at compile time, we have to check both |
matching was set at runtime but not at compile time, we have to check both |
| 2767 |
versions of a character. */ |
versions of a character, and we have to behave differently for positive and |
| 2768 |
|
negative classes. This is the only time where OP_CLASS and OP_NEGCLASS are |
| 2769 |
|
treated differently. */ |
| 2770 |
|
|
| 2771 |
case OP_CLASS: |
case OP_CLASS: |
| 2772 |
|
case OP_NEGCLASS: |
| 2773 |
{ |
{ |
| 2774 |
|
BOOL nasty_case = *ecode == OP_NEGCLASS && md->runtime_caseless; |
| 2775 |
const uschar *data = ecode + 1; /* Save for matching */ |
const uschar *data = ecode + 1; /* Save for matching */ |
| 2776 |
ecode += 33; /* Advance past the item */ |
ecode += 33; /* Advance past the item */ |
| 2777 |
|
|
| 2800 |
break; |
break; |
| 2801 |
|
|
| 2802 |
default: /* No repeat follows */ |
default: /* No repeat follows */ |
| 2803 |
if (eptr >= md->end_subject) return FALSE; |
min = max = 1; |
| 2804 |
c = *eptr++; |
break; |
|
if ((data[c/8] & (1 << (c&7))) != 0) continue; /* With main loop */ |
|
|
if (md->runtime_caseless) |
|
|
{ |
|
|
c = pcre_fcc[c]; |
|
|
if ((data[c/8] & (1 << (c&7))) != 0) continue; /* With main loop */ |
|
|
} |
|
|
return FALSE; |
|
| 2805 |
} |
} |
| 2806 |
|
|
| 2807 |
/* First, ensure the minimum number of matches are present. */ |
/* First, ensure the minimum number of matches are present. */ |
| 2810 |
{ |
{ |
| 2811 |
if (eptr >= md->end_subject) return FALSE; |
if (eptr >= md->end_subject) return FALSE; |
| 2812 |
c = *eptr++; |
c = *eptr++; |
| 2813 |
if ((data[c/8] & (1 << (c&7))) != 0) continue; |
|
| 2814 |
if (md->runtime_caseless) |
/* Either not runtime caseless, or it was a positive class. For |
| 2815 |
|
runtime caseless, continue if either case is in the map. */ |
| 2816 |
|
|
| 2817 |
|
if (!nasty_case) |
| 2818 |
{ |
{ |
| 2819 |
|
if ((data[c/8] & (1 << (c&7))) != 0) continue; |
| 2820 |
|
if (md->runtime_caseless) |
| 2821 |
|
{ |
| 2822 |
|
c = pcre_fcc[c]; |
| 2823 |
|
if ((data[c/8] & (1 << (c&7))) != 0) continue; |
| 2824 |
|
} |
| 2825 |
|
} |
| 2826 |
|
|
| 2827 |
|
/* Runtime caseless and it was a negative class. Continue only if |
| 2828 |
|
both cases are in the map. */ |
| 2829 |
|
|
| 2830 |
|
else |
| 2831 |
|
{ |
| 2832 |
|
if ((data[c/8] & (1 << (c&7))) == 0) return FALSE; |
| 2833 |
c = pcre_fcc[c]; |
c = pcre_fcc[c]; |
| 2834 |
if ((data[c/8] & (1 << (c&7))) != 0) continue; |
if ((data[c/8] & (1 << (c&7))) != 0) continue; |
| 2835 |
} |
} |
| 2836 |
|
|
| 2837 |
return FALSE; |
return FALSE; |
| 2838 |
} |
} |
| 2839 |
|
|
| 2852 |
if (match(eptr, ecode, offset_top, md)) return TRUE; |
if (match(eptr, ecode, offset_top, md)) return TRUE; |
| 2853 |
if (i >= max || eptr >= md->end_subject) return FALSE; |
if (i >= max || eptr >= md->end_subject) return FALSE; |
| 2854 |
c = *eptr++; |
c = *eptr++; |
| 2855 |
if ((data[c/8] & (1 << (c&7))) != 0) continue; |
|
| 2856 |
if (md->runtime_caseless) |
/* Either not runtime caseless, or it was a positive class. For |
| 2857 |
|
runtime caseless, continue if either case is in the map. */ |
| 2858 |
|
|
| 2859 |
|
if (!nasty_case) |
| 2860 |
{ |
{ |
| 2861 |
|
if ((data[c/8] & (1 << (c&7))) != 0) continue; |
| 2862 |
|
if (md->runtime_caseless) |
| 2863 |
|
{ |
| 2864 |
|
c = pcre_fcc[c]; |
| 2865 |
|
if ((data[c/8] & (1 << (c&7))) != 0) continue; |
| 2866 |
|
} |
| 2867 |
|
} |
| 2868 |
|
|
| 2869 |
|
/* Runtime caseless and it was a negative class. Continue only if |
| 2870 |
|
both cases are in the map. */ |
| 2871 |
|
|
| 2872 |
|
else |
| 2873 |
|
{ |
| 2874 |
|
if ((data[c/8] & (1 << (c&7))) == 0) return FALSE; |
| 2875 |
c = pcre_fcc[c]; |
c = pcre_fcc[c]; |
| 2876 |
if ((data[c/8] & (1 << (c&7))) != 0) continue; |
if ((data[c/8] & (1 << (c&7))) != 0) continue; |
| 2877 |
} |
} |
| 2878 |
|
|
| 2879 |
return FALSE; |
return FALSE; |
| 2880 |
} |
} |
| 2881 |
/* Control never gets here */ |
/* Control never gets here */ |
| 2890 |
{ |
{ |
| 2891 |
if (eptr >= md->end_subject) break; |
if (eptr >= md->end_subject) break; |
| 2892 |
c = *eptr; |
c = *eptr; |
| 2893 |
if ((data[c/8] & (1 << (c&7))) != 0) continue; |
|
| 2894 |
if (md->runtime_caseless) |
/* Either not runtime caseless, or it was a positive class. For |
| 2895 |
|
runtime caseless, continue if either case is in the map. */ |
| 2896 |
|
|
| 2897 |
|
if (!nasty_case) |
| 2898 |
{ |
{ |
| 2899 |
|
if ((data[c/8] & (1 << (c&7))) != 0) continue; |
| 2900 |
|
if (md->runtime_caseless) |
| 2901 |
|
{ |
| 2902 |
|
c = pcre_fcc[c]; |
| 2903 |
|
if ((data[c/8] & (1 << (c&7))) != 0) continue; |
| 2904 |
|
} |
| 2905 |
|
} |
| 2906 |
|
|
| 2907 |
|
/* Runtime caseless and it was a negative class. Continue only if |
| 2908 |
|
both cases are in the map. */ |
| 2909 |
|
|
| 2910 |
|
else |
| 2911 |
|
{ |
| 2912 |
|
if ((data[c/8] & (1 << (c&7))) == 0) break; |
| 2913 |
c = pcre_fcc[c]; |
c = pcre_fcc[c]; |
| 2914 |
if ((data[c/8] & (1 << (c&7))) != 0) continue; |
if ((data[c/8] & (1 << (c&7))) != 0) continue; |
| 2915 |
} |
} |
| 2916 |
|
|
| 2917 |
break; |
break; |
| 2918 |
} |
} |
| 2919 |
|
|
| 3490 |
if (re->top_backref > 0 && re->top_backref >= ocount/2) |
if (re->top_backref > 0 && re->top_backref >= ocount/2) |
| 3491 |
{ |
{ |
| 3492 |
ocount = re->top_backref * 2 + 2; |
ocount = re->top_backref * 2 + 2; |
| 3493 |
match_block.offset_vector = (pcre_malloc)(ocount * sizeof(int)); |
match_block.offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int)); |
| 3494 |
if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY; |
if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY; |
| 3495 |
using_temporary_offsets = TRUE; |
using_temporary_offsets = TRUE; |
| 3496 |
DPRINTF(("Got memory to hold back references\n")); |
DPRINTF(("Got memory to hold back references\n")); |