| 6 |
and semantics are as close as possible to those of the Perl 5 language. |
and semantics are as close as possible to those of the Perl 5 language. |
| 7 |
|
|
| 8 |
Written by Philip Hazel |
Written by Philip Hazel |
| 9 |
Copyright (c) 1997-2008 University of Cambridge |
Copyright (c) 1997-2009 University of Cambridge |
| 10 |
|
|
| 11 |
----------------------------------------------------------------------------- |
----------------------------------------------------------------------------- |
| 12 |
Redistribution and use in source and binary forms, with or without |
Redistribution and use in source and binary forms, with or without |
| 302 |
"(*VERB) not recognized\0" |
"(*VERB) not recognized\0" |
| 303 |
"number is too big\0" |
"number is too big\0" |
| 304 |
"subpattern name expected\0" |
"subpattern name expected\0" |
| 305 |
"digit expected after (?+"; |
"digit expected after (?+\0" |
| 306 |
|
"] is an invalid data character in JavaScript compatibility mode"; |
| 307 |
|
|
| 308 |
|
|
| 309 |
/* Table to identify digits and hex digits. This is used when compiling |
/* Table to identify digits and hex digits. This is used when compiling |
| 455 |
find_error_text(int n) |
find_error_text(int n) |
| 456 |
{ |
{ |
| 457 |
const char *s = error_texts; |
const char *s = error_texts; |
| 458 |
for (; n > 0; n--) while (*s++ != 0); |
for (; n > 0; n--) while (*s++ != 0) {}; |
| 459 |
return s; |
return s; |
| 460 |
} |
} |
| 461 |
|
|
| 533 |
break; |
break; |
| 534 |
|
|
| 535 |
/* \g must be followed by one of a number of specific things: |
/* \g must be followed by one of a number of specific things: |
| 536 |
|
|
| 537 |
(1) A number, either plain or braced. If positive, it is an absolute |
(1) A number, either plain or braced. If positive, it is an absolute |
| 538 |
backreference. If negative, it is a relative backreference. This is a Perl |
backreference. If negative, it is a relative backreference. This is a Perl |
| 539 |
5.10 feature. |
5.10 feature. |
| 540 |
|
|
| 541 |
(2) Perl 5.10 also supports \g{name} as a reference to a named group. This |
(2) Perl 5.10 also supports \g{name} as a reference to a named group. This |
| 542 |
is part of Perl's movement towards a unified syntax for back references. As |
is part of Perl's movement towards a unified syntax for back references. As |
| 543 |
this is synonymous with \k{name}, we fudge it up by pretending it really |
this is synonymous with \k{name}, we fudge it up by pretending it really |
| 544 |
was \k. |
was \k. |
| 545 |
|
|
| 546 |
(3) For Oniguruma compatibility we also support \g followed by a name or a |
(3) For Oniguruma compatibility we also support \g followed by a name or a |
| 547 |
number either in angle brackets or in single quotes. However, these are |
number either in angle brackets or in single quotes. However, these are |
| 548 |
(possibly recursive) subroutine calls, _not_ backreferences. Just return |
(possibly recursive) subroutine calls, _not_ backreferences. Just return |
| 549 |
the -ESC_g code (cf \k). */ |
the -ESC_g code (cf \k). */ |
| 550 |
|
|
| 551 |
case 'g': |
case 'g': |
| 552 |
if (ptr[1] == '<' || ptr[1] == '\'') |
if (ptr[1] == '<' || ptr[1] == '\'') |
| 553 |
{ |
{ |
| 554 |
c = -ESC_g; |
c = -ESC_g; |
| 555 |
break; |
break; |
| 556 |
} |
} |
| 557 |
|
|
| 558 |
/* Handle the Perl-compatible cases */ |
/* Handle the Perl-compatible cases */ |
| 559 |
|
|
| 560 |
if (ptr[1] == '{') |
if (ptr[1] == '{') |
| 561 |
{ |
{ |
| 562 |
const uschar *p; |
const uschar *p; |
| 588 |
*errorcodeptr = ERR61; |
*errorcodeptr = ERR61; |
| 589 |
break; |
break; |
| 590 |
} |
} |
| 591 |
|
|
| 592 |
if (braced && *(++ptr) != '}') |
if (braced && *(++ptr) != '}') |
| 593 |
{ |
{ |
| 594 |
*errorcodeptr = ERR57; |
*errorcodeptr = ERR57; |
| 595 |
break; |
break; |
| 596 |
} |
} |
| 597 |
|
|
| 598 |
if (c == 0) |
if (c == 0) |
| 599 |
{ |
{ |
| 600 |
*errorcodeptr = ERR58; |
*errorcodeptr = ERR58; |
| 601 |
break; |
break; |
| 602 |
} |
} |
| 603 |
|
|
| 604 |
if (negated) |
if (negated) |
| 605 |
{ |
{ |
| 976 |
|
|
| 977 |
Arguments: |
Arguments: |
| 978 |
ptr current position in the pattern |
ptr current position in the pattern |
| 979 |
count current count of capturing parens so far encountered |
cd compile background data |
| 980 |
name name to seek, or NULL if seeking a numbered subpattern |
name name to seek, or NULL if seeking a numbered subpattern |
| 981 |
lorn name length, or subpattern number if name is NULL |
lorn name length, or subpattern number if name is NULL |
| 982 |
xmode TRUE if we are in /x mode |
xmode TRUE if we are in /x mode |
| 985 |
*/ |
*/ |
| 986 |
|
|
| 987 |
static int |
static int |
| 988 |
find_parens(const uschar *ptr, int count, const uschar *name, int lorn, |
find_parens(const uschar *ptr, compile_data *cd, const uschar *name, int lorn, |
| 989 |
BOOL xmode) |
BOOL xmode) |
| 990 |
{ |
{ |
| 991 |
const uschar *thisname; |
const uschar *thisname; |
| 992 |
|
int count = cd->bracount; |
| 993 |
|
|
| 994 |
for (; *ptr != 0; ptr++) |
for (; *ptr != 0; ptr++) |
| 995 |
{ |
{ |
| 1002 |
if (*(++ptr) == 0) return -1; |
if (*(++ptr) == 0) return -1; |
| 1003 |
if (*ptr == 'Q') for (;;) |
if (*ptr == 'Q') for (;;) |
| 1004 |
{ |
{ |
| 1005 |
while (*(++ptr) != 0 && *ptr != '\\'); |
while (*(++ptr) != 0 && *ptr != '\\') {}; |
| 1006 |
if (*ptr == 0) return -1; |
if (*ptr == 0) return -1; |
| 1007 |
if (*(++ptr) == 'E') break; |
if (*(++ptr) == 'E') break; |
| 1008 |
} |
} |
| 1009 |
continue; |
continue; |
| 1010 |
} |
} |
| 1011 |
|
|
| 1012 |
/* Skip over character classes */ |
/* Skip over character classes; this logic must be similar to the way they |
| 1013 |
|
are handled for real. If the first character is '^', skip it. Also, if the |
| 1014 |
|
first few characters (either before or after ^) are \Q\E or \E we skip them |
| 1015 |
|
too. This makes for compatibility with Perl. */ |
| 1016 |
|
|
| 1017 |
if (*ptr == '[') |
if (*ptr == '[') |
| 1018 |
{ |
{ |
| 1019 |
|
BOOL negate_class = FALSE; |
| 1020 |
|
for (;;) |
| 1021 |
|
{ |
| 1022 |
|
int c = *(++ptr); |
| 1023 |
|
if (c == '\\') |
| 1024 |
|
{ |
| 1025 |
|
if (ptr[1] == 'E') ptr++; |
| 1026 |
|
else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3; |
| 1027 |
|
else break; |
| 1028 |
|
} |
| 1029 |
|
else if (!negate_class && c == '^') |
| 1030 |
|
negate_class = TRUE; |
| 1031 |
|
else break; |
| 1032 |
|
} |
| 1033 |
|
|
| 1034 |
|
/* If the next character is ']', it is a data character that must be |
| 1035 |
|
skipped, except in JavaScript compatibility mode. */ |
| 1036 |
|
|
| 1037 |
|
if (ptr[1] == ']' && (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0) |
| 1038 |
|
ptr++; |
| 1039 |
|
|
| 1040 |
while (*(++ptr) != ']') |
while (*(++ptr) != ']') |
| 1041 |
{ |
{ |
| 1042 |
if (*ptr == 0) return -1; |
if (*ptr == 0) return -1; |
| 1045 |
if (*(++ptr) == 0) return -1; |
if (*(++ptr) == 0) return -1; |
| 1046 |
if (*ptr == 'Q') for (;;) |
if (*ptr == 'Q') for (;;) |
| 1047 |
{ |
{ |
| 1048 |
while (*(++ptr) != 0 && *ptr != '\\'); |
while (*(++ptr) != 0 && *ptr != '\\') {}; |
| 1049 |
if (*ptr == 0) return -1; |
if (*ptr == 0) return -1; |
| 1050 |
if (*(++ptr) == 'E') break; |
if (*(++ptr) == 'E') break; |
| 1051 |
} |
} |
| 1059 |
|
|
| 1060 |
if (xmode && *ptr == '#') |
if (xmode && *ptr == '#') |
| 1061 |
{ |
{ |
| 1062 |
while (*(++ptr) != 0 && *ptr != '\n'); |
while (*(++ptr) != 0 && *ptr != '\n') {}; |
| 1063 |
if (*ptr == 0) return -1; |
if (*ptr == 0) return -1; |
| 1064 |
continue; |
continue; |
| 1065 |
} |
} |
| 1301 |
case OP_NOT_WORDCHAR: |
case OP_NOT_WORDCHAR: |
| 1302 |
case OP_WORDCHAR: |
case OP_WORDCHAR: |
| 1303 |
case OP_ANY: |
case OP_ANY: |
| 1304 |
|
case OP_ALLANY: |
| 1305 |
branchlength++; |
branchlength++; |
| 1306 |
cc++; |
cc++; |
| 1307 |
break; |
break; |
| 1450 |
if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f]; |
if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f]; |
| 1451 |
break; |
break; |
| 1452 |
} |
} |
| 1453 |
|
#else |
| 1454 |
|
(void)(utf8); /* Keep compiler happy by referencing function argument */ |
| 1455 |
#endif |
#endif |
| 1456 |
} |
} |
| 1457 |
} |
} |
| 1545 |
if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f]; |
if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f]; |
| 1546 |
break; |
break; |
| 1547 |
} |
} |
| 1548 |
|
#else |
| 1549 |
|
(void)(utf8); /* Keep compiler happy by referencing function argument */ |
| 1550 |
#endif |
#endif |
| 1551 |
} |
} |
| 1552 |
} |
} |
| 1598 |
|
|
| 1599 |
/* Groups with zero repeats can of course be empty; skip them. */ |
/* Groups with zero repeats can of course be empty; skip them. */ |
| 1600 |
|
|
| 1601 |
if (c == OP_BRAZERO || c == OP_BRAMINZERO) |
if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO) |
| 1602 |
{ |
{ |
| 1603 |
code += _pcre_OP_lengths[c]; |
code += _pcre_OP_lengths[c]; |
| 1604 |
do code += GET(code, 1); while (*code == OP_ALT); |
do code += GET(code, 1); while (*code == OP_ALT); |
| 1684 |
case OP_NOT_WORDCHAR: |
case OP_NOT_WORDCHAR: |
| 1685 |
case OP_WORDCHAR: |
case OP_WORDCHAR: |
| 1686 |
case OP_ANY: |
case OP_ANY: |
| 1687 |
|
case OP_ALLANY: |
| 1688 |
case OP_ANYBYTE: |
case OP_ANYBYTE: |
| 1689 |
case OP_CHAR: |
case OP_CHAR: |
| 1690 |
case OP_CHARNC: |
case OP_CHARNC: |
| 1879 |
that is referenced. This means that groups can be replicated for fixed |
that is referenced. This means that groups can be replicated for fixed |
| 1880 |
repetition simply by copying (because the recursion is allowed to refer to |
repetition simply by copying (because the recursion is allowed to refer to |
| 1881 |
earlier groups that are outside the current group). However, when a group is |
earlier groups that are outside the current group). However, when a group is |
| 1882 |
optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before |
optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is |
| 1883 |
it, after it has been compiled. This means that any OP_RECURSE items within it |
inserted before it, after it has been compiled. This means that any OP_RECURSE |
| 1884 |
that refer to the group itself or any contained groups have to have their |
items within it that refer to the group itself or any contained groups have to |
| 1885 |
offsets adjusted. That is one of the jobs of this function. Before it is called, |
have their offsets adjusted. That is one of the jobs of this function. Before it |
| 1886 |
the partially compiled regex must be temporarily terminated with OP_END. |
is called, the partially compiled regex must be temporarily terminated with |
| 1887 |
|
OP_END. |
| 1888 |
|
|
| 1889 |
This function has been extended with the possibility of forward references for |
This function has been extended with the possibility of forward references for |
| 1890 |
recursions and subroutine calls. It must also check the list of such references |
recursions and subroutine calls. It must also check the list of such references |
| 2019 |
unsigned int c, othercase, next; |
unsigned int c, othercase, next; |
| 2020 |
|
|
| 2021 |
for (c = *cptr; c <= d; c++) |
for (c = *cptr; c <= d; c++) |
| 2022 |
{ if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) break; } |
{ if ((othercase = UCD_OTHERCASE(c)) != c) break; } |
| 2023 |
|
|
| 2024 |
if (c > d) return FALSE; |
if (c > d) return FALSE; |
| 2025 |
|
|
| 2028 |
|
|
| 2029 |
for (++c; c <= d; c++) |
for (++c; c <= d; c++) |
| 2030 |
{ |
{ |
| 2031 |
if (_pcre_ucp_othercase(c) != next) break; |
if (UCD_OTHERCASE(c) != next) break; |
| 2032 |
next++; |
next++; |
| 2033 |
} |
} |
| 2034 |
|
|
| 2138 |
case OP_CHAR: |
case OP_CHAR: |
| 2139 |
#ifdef SUPPORT_UTF8 |
#ifdef SUPPORT_UTF8 |
| 2140 |
if (utf8 && item > 127) { GETCHAR(item, utf8_char); } |
if (utf8 && item > 127) { GETCHAR(item, utf8_char); } |
| 2141 |
|
#else |
| 2142 |
|
(void)(utf8_char); /* Keep compiler happy by referencing function argument */ |
| 2143 |
#endif |
#endif |
| 2144 |
return item != next; |
return item != next; |
| 2145 |
|
|
| 2158 |
unsigned int othercase; |
unsigned int othercase; |
| 2159 |
if (next < 128) othercase = cd->fcc[next]; else |
if (next < 128) othercase = cd->fcc[next]; else |
| 2160 |
#ifdef SUPPORT_UCP |
#ifdef SUPPORT_UCP |
| 2161 |
othercase = _pcre_ucp_othercase((unsigned int)next); |
othercase = UCD_OTHERCASE((unsigned int)next); |
| 2162 |
#else |
#else |
| 2163 |
othercase = NOTACHAR; |
othercase = NOTACHAR; |
| 2164 |
#endif |
#endif |
| 2179 |
unsigned int othercase; |
unsigned int othercase; |
| 2180 |
if (next < 128) othercase = cd->fcc[next]; else |
if (next < 128) othercase = cd->fcc[next]; else |
| 2181 |
#ifdef SUPPORT_UCP |
#ifdef SUPPORT_UCP |
| 2182 |
othercase = _pcre_ucp_othercase(next); |
othercase = UCD_OTHERCASE(next); |
| 2183 |
#else |
#else |
| 2184 |
othercase = NOTACHAR; |
othercase = NOTACHAR; |
| 2185 |
#endif |
#endif |
| 2673 |
zerofirstbyte = firstbyte; |
zerofirstbyte = firstbyte; |
| 2674 |
zeroreqbyte = reqbyte; |
zeroreqbyte = reqbyte; |
| 2675 |
previous = code; |
previous = code; |
| 2676 |
*code++ = OP_ANY; |
*code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY; |
| 2677 |
break; |
break; |
| 2678 |
|
|
| 2679 |
|
|
| 2688 |
opcode is compiled. It may optionally have a bit map for characters < 256, |
opcode is compiled. It may optionally have a bit map for characters < 256, |
| 2689 |
but those above are explicitly listed afterwards. A flag byte tells |
but those above are explicitly listed afterwards. A flag byte tells |
| 2690 |
whether the bitmap is present, and whether this is a negated class or not. |
whether the bitmap is present, and whether this is a negated class or not. |
| 2691 |
*/ |
|
| 2692 |
|
In JavaScript compatibility mode, an isolated ']' causes an error. In |
| 2693 |
|
default (Perl) mode, it is treated as a data character. */ |
| 2694 |
|
|
| 2695 |
|
case ']': |
| 2696 |
|
if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0) |
| 2697 |
|
{ |
| 2698 |
|
*errorcodeptr = ERR64; |
| 2699 |
|
goto FAILED; |
| 2700 |
|
} |
| 2701 |
|
goto NORMAL_CHAR; |
| 2702 |
|
|
| 2703 |
case '[': |
case '[': |
| 2704 |
previous = code; |
previous = code; |
| 2732 |
else break; |
else break; |
| 2733 |
} |
} |
| 2734 |
|
|
| 2735 |
|
/* Empty classes are allowed in JavaScript compatibility mode. Otherwise, |
| 2736 |
|
an initial ']' is taken as a data character -- the code below handles |
| 2737 |
|
that. In JS mode, [] must always fail, so generate OP_FAIL, whereas |
| 2738 |
|
[^] must match any character, so generate OP_ALLANY. */ |
| 2739 |
|
|
| 2740 |
|
if (c ==']' && (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0) |
| 2741 |
|
{ |
| 2742 |
|
*code++ = negate_class? OP_ALLANY : OP_FAIL; |
| 2743 |
|
if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE; |
| 2744 |
|
zerofirstbyte = firstbyte; |
| 2745 |
|
break; |
| 2746 |
|
} |
| 2747 |
|
|
| 2748 |
/* If a class contains a negative special such as \S, we need to flip the |
/* If a class contains a negative special such as \S, we need to flip the |
| 2749 |
negation flag at the end, so that support for characters > 255 works |
negation flag at the end, so that support for characters > 255 works |
| 2750 |
correctly (they are all included in the class). */ |
correctly (they are all included in the class). */ |
| 3345 |
if ((options & PCRE_CASELESS) != 0) |
if ((options & PCRE_CASELESS) != 0) |
| 3346 |
{ |
{ |
| 3347 |
unsigned int othercase; |
unsigned int othercase; |
| 3348 |
if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) |
if ((othercase = UCD_OTHERCASE(c)) != c) |
| 3349 |
{ |
{ |
| 3350 |
*class_utf8data++ = XCL_SINGLE; |
*class_utf8data++ = XCL_SINGLE; |
| 3351 |
class_utf8data += _pcre_ord2utf8(othercase, class_utf8data); |
class_utf8data += _pcre_ord2utf8(othercase, class_utf8data); |
| 3900 |
|
|
| 3901 |
if (repeat_min == 0) |
if (repeat_min == 0) |
| 3902 |
{ |
{ |
| 3903 |
/* If the maximum is also zero, we just omit the group from the output |
/* If the maximum is also zero, we used to just omit the group from the |
| 3904 |
altogether. */ |
output altogether, like this: |
|
|
|
|
if (repeat_max == 0) |
|
|
{ |
|
|
code = previous; |
|
|
goto END_REPEAT; |
|
|
} |
|
| 3905 |
|
|
| 3906 |
/* If the maximum is 1 or unlimited, we just have to stick in the |
** if (repeat_max == 0) |
| 3907 |
BRAZERO and do no more at this point. However, we do need to adjust |
** { |
| 3908 |
any OP_RECURSE calls inside the group that refer to the group itself or |
** code = previous; |
| 3909 |
any internal or forward referenced group, because the offset is from |
** goto END_REPEAT; |
| 3910 |
the start of the whole regex. Temporarily terminate the pattern while |
** } |
| 3911 |
doing this. */ |
|
| 3912 |
|
However, that fails when a group is referenced as a subroutine from |
| 3913 |
|
elsewhere in the pattern, so now we stick in OP_SKIPZERO in front of it |
| 3914 |
|
so that it is skipped on execution. As we don't have a list of which |
| 3915 |
|
groups are referenced, we cannot do this selectively. |
| 3916 |
|
|
| 3917 |
|
If the maximum is 1 or unlimited, we just have to stick in the BRAZERO |
| 3918 |
|
and do no more at this point. However, we do need to adjust any |
| 3919 |
|
OP_RECURSE calls inside the group that refer to the group itself or any |
| 3920 |
|
internal or forward referenced group, because the offset is from the |
| 3921 |
|
start of the whole regex. Temporarily terminate the pattern while doing |
| 3922 |
|
this. */ |
| 3923 |
|
|
| 3924 |
if (repeat_max <= 1) |
if (repeat_max <= 1) /* Covers 0, 1, and unlimited */ |
| 3925 |
{ |
{ |
| 3926 |
*code = OP_END; |
*code = OP_END; |
| 3927 |
adjust_recurse(previous, 1, utf8, cd, save_hwm); |
adjust_recurse(previous, 1, utf8, cd, save_hwm); |
| 3928 |
memmove(previous+1, previous, len); |
memmove(previous+1, previous, len); |
| 3929 |
code++; |
code++; |
| 3930 |
|
if (repeat_max == 0) |
| 3931 |
|
{ |
| 3932 |
|
*previous++ = OP_SKIPZERO; |
| 3933 |
|
goto END_REPEAT; |
| 3934 |
|
} |
| 3935 |
*previous++ = OP_BRAZERO + repeat_type; |
*previous++ = OP_BRAZERO + repeat_type; |
| 3936 |
} |
} |
| 3937 |
|
|
| 4126 |
} |
} |
| 4127 |
} |
} |
| 4128 |
|
|
| 4129 |
|
/* If previous is OP_FAIL, it was generated by an empty class [] in |
| 4130 |
|
JavaScript mode. The other ways in which OP_FAIL can be generated, that is |
| 4131 |
|
by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat" |
| 4132 |
|
error above. We can just ignore the repeat in JS case. */ |
| 4133 |
|
|
| 4134 |
|
else if (*previous == OP_FAIL) goto END_REPEAT; |
| 4135 |
|
|
| 4136 |
/* Else there's some kind of shambles */ |
/* Else there's some kind of shambles */ |
| 4137 |
|
|
| 4138 |
else |
else |
| 4222 |
const char *vn = verbnames; |
const char *vn = verbnames; |
| 4223 |
const uschar *name = ++ptr; |
const uschar *name = ++ptr; |
| 4224 |
previous = NULL; |
previous = NULL; |
| 4225 |
while ((cd->ctypes[*++ptr] & ctype_letter) != 0); |
while ((cd->ctypes[*++ptr] & ctype_letter) != 0) {}; |
| 4226 |
if (*ptr == ':') |
if (*ptr == ':') |
| 4227 |
{ |
{ |
| 4228 |
*errorcodeptr = ERR59; /* Not supported */ |
*errorcodeptr = ERR59; /* Not supported */ |
| 4419 |
|
|
| 4420 |
/* Search the pattern for a forward reference */ |
/* Search the pattern for a forward reference */ |
| 4421 |
|
|
| 4422 |
else if ((i = find_parens(ptr, cd->bracount, name, namelen, |
else if ((i = find_parens(ptr, cd, name, namelen, |
| 4423 |
(options & PCRE_EXTENDED) != 0)) > 0) |
(options & PCRE_EXTENDED) != 0)) > 0) |
| 4424 |
{ |
{ |
| 4425 |
PUT2(code, 2+LINK_SIZE, i); |
PUT2(code, 2+LINK_SIZE, i); |
| 4716 |
recno = GET2(slot, 0); |
recno = GET2(slot, 0); |
| 4717 |
} |
} |
| 4718 |
else if ((recno = /* Forward back reference */ |
else if ((recno = /* Forward back reference */ |
| 4719 |
find_parens(ptr, cd->bracount, name, namelen, |
find_parens(ptr, cd, name, namelen, |
| 4720 |
(options & PCRE_EXTENDED) != 0)) <= 0) |
(options & PCRE_EXTENDED) != 0)) <= 0) |
| 4721 |
{ |
{ |
| 4722 |
*errorcodeptr = ERR15; |
*errorcodeptr = ERR15; |
| 4744 |
{ |
{ |
| 4745 |
const uschar *called; |
const uschar *called; |
| 4746 |
terminator = ')'; |
terminator = ')'; |
| 4747 |
|
|
| 4748 |
/* Come here from the \g<...> and \g'...' code (Oniguruma |
/* Come here from the \g<...> and \g'...' code (Oniguruma |
| 4749 |
compatibility). However, the syntax has been checked to ensure that |
compatibility). However, the syntax has been checked to ensure that |
| 4750 |
the ... are a (signed) number, so that neither ERR63 nor ERR29 will |
the ... are a (signed) number, so that neither ERR63 nor ERR29 will |
| 4751 |
be called on this path, nor will the jump to OTHER_CHAR_AFTER_QUERY |
be called on this path, nor will the jump to OTHER_CHAR_AFTER_QUERY |
| 4752 |
ever be taken. */ |
ever be taken. */ |
| 4753 |
|
|
| 4754 |
HANDLE_NUMERICAL_RECURSION: |
HANDLE_NUMERICAL_RECURSION: |
| 4755 |
|
|
| 4756 |
if ((refsign = *ptr) == '+') |
if ((refsign = *ptr) == '+') |
| 4757 |
{ |
{ |
| 4826 |
|
|
| 4827 |
if (called == NULL) |
if (called == NULL) |
| 4828 |
{ |
{ |
| 4829 |
if (find_parens(ptr, cd->bracount, NULL, recno, |
if (find_parens(ptr, cd, NULL, recno, |
| 4830 |
(options & PCRE_EXTENDED) != 0) < 0) |
(options & PCRE_EXTENDED) != 0) < 0) |
| 4831 |
{ |
{ |
| 4832 |
*errorcodeptr = ERR15; |
*errorcodeptr = ERR15; |
| 4833 |
goto FAILED; |
goto FAILED; |
| 4926 |
both phases. |
both phases. |
| 4927 |
|
|
| 4928 |
If we are not at the pattern start, compile code to change the ims |
If we are not at the pattern start, compile code to change the ims |
| 4929 |
options if this setting actually changes any of them. We also pass the |
options if this setting actually changes any of them, and reset the |
| 4930 |
new setting back so that it can be put at the start of any following |
greedy defaults and the case value for firstbyte and reqbyte. */ |
|
branches, and when this group ends (if we are in a group), a resetting |
|
|
item can be compiled. */ |
|
| 4931 |
|
|
| 4932 |
if (*ptr == ')') |
if (*ptr == ')') |
| 4933 |
{ |
{ |
| 4935 |
(lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE)) |
(lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE)) |
| 4936 |
{ |
{ |
| 4937 |
cd->external_options = newoptions; |
cd->external_options = newoptions; |
|
options = newoptions; |
|
| 4938 |
} |
} |
| 4939 |
else |
else |
| 4940 |
{ |
{ |
| 4943 |
*code++ = OP_OPT; |
*code++ = OP_OPT; |
| 4944 |
*code++ = newoptions & PCRE_IMS; |
*code++ = newoptions & PCRE_IMS; |
| 4945 |
} |
} |
|
|
|
|
/* Change options at this level, and pass them back for use |
|
|
in subsequent branches. Reset the greedy defaults and the case |
|
|
value for firstbyte and reqbyte. */ |
|
|
|
|
|
*optionsptr = options = newoptions; |
|
| 4946 |
greedy_default = ((newoptions & PCRE_UNGREEDY) != 0); |
greedy_default = ((newoptions & PCRE_UNGREEDY) != 0); |
| 4947 |
greedy_non_default = greedy_default ^ 1; |
greedy_non_default = greedy_default ^ 1; |
| 4948 |
req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0; |
req_caseopt = ((newoptions & PCRE_CASELESS) != 0)? REQ_CASELESS : 0; |
| 4949 |
} |
} |
| 4950 |
|
|
| 4951 |
|
/* Change options at this level, and pass them back for use |
| 4952 |
|
in subsequent branches. When not at the start of the pattern, this |
| 4953 |
|
information is also necessary so that a resetting item can be |
| 4954 |
|
compiled at the end of a group (if we are in a group). */ |
| 4955 |
|
|
| 4956 |
|
*optionsptr = options = newoptions; |
| 4957 |
previous = NULL; /* This item can't be repeated */ |
previous = NULL; /* This item can't be repeated */ |
| 4958 |
continue; /* It is complete */ |
continue; /* It is complete */ |
| 4959 |
} |
} |
| 5166 |
back references and those types that consume a character may be repeated. |
back references and those types that consume a character may be repeated. |
| 5167 |
We can test for values between ESC_b and ESC_Z for the latter; this may |
We can test for values between ESC_b and ESC_Z for the latter; this may |
| 5168 |
have to change if any new ones are ever created. */ |
have to change if any new ones are ever created. */ |
| 5169 |
|
|
| 5170 |
case '\\': |
case '\\': |
| 5171 |
tempptr = ptr; |
tempptr = ptr; |
| 5172 |
c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE); |
c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE); |
| 5193 |
|
|
| 5194 |
zerofirstbyte = firstbyte; |
zerofirstbyte = firstbyte; |
| 5195 |
zeroreqbyte = reqbyte; |
zeroreqbyte = reqbyte; |
| 5196 |
|
|
| 5197 |
/* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n' |
/* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n' |
| 5198 |
is a subroutine call by number (Oniguruma syntax). In fact, the value |
is a subroutine call by number (Oniguruma syntax). In fact, the value |
| 5199 |
-ESC_g is returned only for these cases. So we don't need to check for < |
-ESC_g is returned only for these cases. So we don't need to check for < |
| 5200 |
or ' if the value is -ESC_g. For the Perl syntax \g{n} the value is |
or ' if the value is -ESC_g. For the Perl syntax \g{n} the value is |
| 5201 |
-ESC_REF+n, and for the Perl syntax \g{name} the result is -ESC_k (as |
-ESC_REF+n, and for the Perl syntax \g{name} the result is -ESC_k (as |
| 5202 |
that is a synonym). */ |
that is a synonym for a named back reference). */ |
| 5203 |
|
|
| 5204 |
if (-c == ESC_g) |
if (-c == ESC_g) |
| 5205 |
{ |
{ |
| 5206 |
const uschar *p; |
const uschar *p; |
| 5207 |
|
save_hwm = cd->hwm; /* Normally this is set when '(' is read */ |
| 5208 |
terminator = (*(++ptr) == '<')? '>' : '\''; |
terminator = (*(++ptr) == '<')? '>' : '\''; |
| 5209 |
|
|
| 5210 |
/* These two statements stop the compiler from warning about possibly |
/* These two statements stop the compiler from warning about possibly |
| 5211 |
unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In |
unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In |
| 5212 |
fact, because we actually check for a number below, the paths that |
fact, because we actually check for a number below, the paths that |
| 5213 |
would actually be in error are never taken. */ |
would actually be in error are never taken. */ |
| 5214 |
|
|
| 5215 |
skipbytes = 0; |
skipbytes = 0; |
| 5216 |
reset_bracount = FALSE; |
reset_bracount = FALSE; |
| 5217 |
|
|
| 5218 |
/* Test for a name */ |
/* Test for a name */ |
| 5219 |
|
|
| 5220 |
if (ptr[1] != '+' && ptr[1] != '-') |
if (ptr[1] != '+' && ptr[1] != '-') |
| 5221 |
{ |
{ |
| 5222 |
BOOL isnumber = TRUE; |
BOOL isnumber = TRUE; |
| 5223 |
for (p = ptr + 1; *p != 0 && *p != terminator; p++) |
for (p = ptr + 1; *p != 0 && *p != terminator; p++) |
| 5224 |
{ |
{ |
| 5225 |
if ((cd->ctypes[*p] & ctype_digit) == 0) isnumber = FALSE; |
if ((cd->ctypes[*p] & ctype_digit) == 0) isnumber = FALSE; |
| 5226 |
if ((cd->ctypes[*p] & ctype_word) == 0) break; |
if ((cd->ctypes[*p] & ctype_word) == 0) break; |
| 5227 |
} |
} |
| 5228 |
if (*p != terminator) |
if (*p != terminator) |
| 5229 |
{ |
{ |
| 5230 |
*errorcodeptr = ERR57; |
*errorcodeptr = ERR57; |
| 5231 |
break; |
break; |
| 5232 |
} |
} |
| 5233 |
if (isnumber) |
if (isnumber) |
| 5234 |
{ |
{ |
| 5235 |
ptr++; |
ptr++; |
| 5236 |
goto HANDLE_NUMERICAL_RECURSION; |
goto HANDLE_NUMERICAL_RECURSION; |
| 5237 |
} |
} |
| 5238 |
is_recurse = TRUE; |
is_recurse = TRUE; |
| 5239 |
goto NAMED_REF_OR_RECURSE; |
goto NAMED_REF_OR_RECURSE; |
| 5240 |
} |
} |
| 5241 |
|
|
| 5242 |
/* Test a signed number in angle brackets or quotes. */ |
/* Test a signed number in angle brackets or quotes. */ |
| 5243 |
|
|
| 5244 |
p = ptr + 2; |
p = ptr + 2; |
| 5245 |
while ((digitab[*p] & ctype_digit) != 0) p++; |
while ((digitab[*p] & ctype_digit) != 0) p++; |
| 5246 |
if (*p != terminator) |
if (*p != terminator) |
| 5248 |
*errorcodeptr = ERR57; |
*errorcodeptr = ERR57; |
| 5249 |
break; |
break; |
| 5250 |
} |
} |
| 5251 |
ptr++; |
ptr++; |
| 5252 |
goto HANDLE_NUMERICAL_RECURSION; |
goto HANDLE_NUMERICAL_RECURSION; |
| 5253 |
} |
} |
| 5254 |
|
|
| 5255 |
/* \k<name> or \k'name' is a back reference by name (Perl syntax). |
/* \k<name> or \k'name' is a back reference by name (Perl syntax). |
| 5256 |
We also support \k{name} (.NET syntax) */ |
We also support \k{name} (.NET syntax) */ |
| 5758 |
if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE; |
if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE; |
| 5759 |
} |
} |
| 5760 |
|
|
| 5761 |
/* .* is not anchored unless DOTALL is set and it isn't in brackets that |
/* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and |
| 5762 |
are or may be referenced. */ |
it isn't in brackets that are or may be referenced. */ |
| 5763 |
|
|
| 5764 |
else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR || |
else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR || |
| 5765 |
op == OP_TYPEPOSSTAR) && |
op == OP_TYPEPOSSTAR)) |
|
(*options & PCRE_DOTALL) != 0) |
|
| 5766 |
{ |
{ |
| 5767 |
if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE; |
if (scode[1] != OP_ALLANY || (bracket_map & backref_map) != 0) |
| 5768 |
|
return FALSE; |
| 5769 |
} |
} |
| 5770 |
|
|
| 5771 |
/* Check for explicit anchoring */ |
/* Check for explicit anchoring */ |
| 5810 |
const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code], |
const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code], |
| 5811 |
NULL, 0, FALSE); |
NULL, 0, FALSE); |
| 5812 |
register int op = *scode; |
register int op = *scode; |
| 5813 |
|
|
| 5814 |
|
/* If we are at the start of a conditional assertion group, *both* the |
| 5815 |
|
conditional assertion *and* what follows the condition must satisfy the test |
| 5816 |
|
for start of line. Other kinds of condition fail. Note that there may be an |
| 5817 |
|
auto-callout at the start of a condition. */ |
| 5818 |
|
|
| 5819 |
|
if (op == OP_COND) |
| 5820 |
|
{ |
| 5821 |
|
scode += 1 + LINK_SIZE; |
| 5822 |
|
if (*scode == OP_CALLOUT) scode += _pcre_OP_lengths[OP_CALLOUT]; |
| 5823 |
|
switch (*scode) |
| 5824 |
|
{ |
| 5825 |
|
case OP_CREF: |
| 5826 |
|
case OP_RREF: |
| 5827 |
|
case OP_DEF: |
| 5828 |
|
return FALSE; |
| 5829 |
|
|
| 5830 |
|
default: /* Assertion */ |
| 5831 |
|
if (!is_startline(scode, bracket_map, backref_map)) return FALSE; |
| 5832 |
|
do scode += GET(scode, 1); while (*scode == OP_ALT); |
| 5833 |
|
scode += 1 + LINK_SIZE; |
| 5834 |
|
break; |
| 5835 |
|
} |
| 5836 |
|
scode = first_significant_code(scode, NULL, 0, FALSE); |
| 5837 |
|
op = *scode; |
| 5838 |
|
} |
| 5839 |
|
|
| 5840 |
/* Non-capturing brackets */ |
/* Non-capturing brackets */ |
| 5841 |
|
|
| 5855 |
|
|
| 5856 |
/* Other brackets */ |
/* Other brackets */ |
| 5857 |
|
|
| 5858 |
else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND) |
else if (op == OP_ASSERT || op == OP_ONCE) |
| 5859 |
{ if (!is_startline(scode, bracket_map, backref_map)) return FALSE; } |
{ |
| 5860 |
|
if (!is_startline(scode, bracket_map, backref_map)) return FALSE; |
| 5861 |
|
} |
| 5862 |
|
|
| 5863 |
/* .* means "start at start or after \n" if it isn't in brackets that |
/* .* means "start at start or after \n" if it isn't in brackets that |
| 5864 |
may be referenced. */ |
may be referenced. */ |
| 5975 |
with errorptr and erroroffset set |
with errorptr and erroroffset set |
| 5976 |
*/ |
*/ |
| 5977 |
|
|
| 5978 |
PCRE_EXP_DEFN pcre * |
PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION |
| 5979 |
pcre_compile(const char *pattern, int options, const char **errorptr, |
pcre_compile(const char *pattern, int options, const char **errorptr, |
| 5980 |
int *erroroffset, const unsigned char *tables) |
int *erroroffset, const unsigned char *tables) |
| 5981 |
{ |
{ |
| 5983 |
} |
} |
| 5984 |
|
|
| 5985 |
|
|
| 5986 |
PCRE_EXP_DEFN pcre * |
PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION |
| 5987 |
pcre_compile2(const char *pattern, int options, int *errorcodeptr, |
pcre_compile2(const char *pattern, int options, int *errorcodeptr, |
| 5988 |
const char **errorptr, int *erroroffset, const unsigned char *tables) |
const char **errorptr, int *erroroffset, const unsigned char *tables) |
| 5989 |
{ |
{ |
| 6055 |
} |
} |
| 6056 |
#endif |
#endif |
| 6057 |
|
|
| 6058 |
if ((options & ~PUBLIC_OPTIONS) != 0) |
if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0) |
| 6059 |
{ |
{ |
| 6060 |
errorcode = ERR17; |
errorcode = ERR17; |
| 6061 |
goto PCRE_EARLY_ERROR_RETURN; |
goto PCRE_EARLY_ERROR_RETURN; |