| 158 |
"SKIP\0" |
"SKIP\0" |
| 159 |
"THEN"; |
"THEN"; |
| 160 |
|
|
| 161 |
static verbitem verbs[] = { |
static const verbitem verbs[] = { |
| 162 |
{ 6, OP_ACCEPT }, |
{ 6, OP_ACCEPT }, |
| 163 |
{ 6, OP_COMMIT }, |
{ 6, OP_COMMIT }, |
| 164 |
{ 1, OP_FAIL }, |
{ 1, OP_FAIL }, |
| 168 |
{ 4, OP_THEN } |
{ 4, OP_THEN } |
| 169 |
}; |
}; |
| 170 |
|
|
| 171 |
static int verbcount = sizeof(verbs)/sizeof(verbitem); |
static const int verbcount = sizeof(verbs)/sizeof(verbitem); |
| 172 |
|
|
| 173 |
|
|
| 174 |
/* Tables of names of POSIX character classes and their lengths. The names are |
/* Tables of names of POSIX character classes and their lengths. The names are |
| 295 |
/* 55 */ |
/* 55 */ |
| 296 |
"repeating a DEFINE group is not allowed\0" |
"repeating a DEFINE group is not allowed\0" |
| 297 |
"inconsistent NEWLINE options\0" |
"inconsistent NEWLINE options\0" |
| 298 |
"\\g is not followed by a braced name or an optionally braced non-zero number\0" |
"\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0" |
| 299 |
"(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number\0" |
"a numbered reference must not be zero\0" |
| 300 |
"(*VERB) with an argument is not supported\0" |
"(*VERB) with an argument is not supported\0" |
| 301 |
/* 60 */ |
/* 60 */ |
| 302 |
"(*VERB) not recognized\0" |
"(*VERB) not recognized\0" |
| 531 |
*errorcodeptr = ERR37; |
*errorcodeptr = ERR37; |
| 532 |
break; |
break; |
| 533 |
|
|
| 534 |
/* \g must be followed by a number, either plain or braced. If positive, it |
/* \g must be followed by one of a number of specific things: |
| 535 |
is an absolute backreference. If negative, it is a relative backreference. |
|
| 536 |
This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a |
(1) A number, either plain or braced. If positive, it is an absolute |
| 537 |
reference to a named group. This is part of Perl's movement towards a |
backreference. If negative, it is a relative backreference. This is a Perl |
| 538 |
unified syntax for back references. As this is synonymous with \k{name}, we |
5.10 feature. |
| 539 |
fudge it up by pretending it really was \k. */ |
|
| 540 |
|
(2) Perl 5.10 also supports \g{name} as a reference to a named group. This |
| 541 |
|
is part of Perl's movement towards a unified syntax for back references. As |
| 542 |
|
this is synonymous with \k{name}, we fudge it up by pretending it really |
| 543 |
|
was \k. |
| 544 |
|
|
| 545 |
|
(3) For Oniguruma compatibility we also support \g followed by a name or a |
| 546 |
|
number either in angle brackets or in single quotes. However, these are |
| 547 |
|
(possibly recursive) subroutine calls, _not_ backreferences. Just return |
| 548 |
|
the -ESC_g code (cf \k). */ |
| 549 |
|
|
| 550 |
case 'g': |
case 'g': |
| 551 |
|
if (ptr[1] == '<' || ptr[1] == '\'') |
| 552 |
|
{ |
| 553 |
|
c = -ESC_g; |
| 554 |
|
break; |
| 555 |
|
} |
| 556 |
|
|
| 557 |
|
/* Handle the Perl-compatible cases */ |
| 558 |
|
|
| 559 |
if (ptr[1] == '{') |
if (ptr[1] == '{') |
| 560 |
{ |
{ |
| 561 |
const uschar *p; |
const uschar *p; |
| 582 |
while ((digitab[ptr[1]] & ctype_digit) != 0) |
while ((digitab[ptr[1]] & ctype_digit) != 0) |
| 583 |
c = c * 10 + *(++ptr) - '0'; |
c = c * 10 + *(++ptr) - '0'; |
| 584 |
|
|
| 585 |
if (c < 0) |
if (c < 0) /* Integer overflow */ |
| 586 |
{ |
{ |
| 587 |
*errorcodeptr = ERR61; |
*errorcodeptr = ERR61; |
| 588 |
break; |
break; |
| 589 |
} |
} |
| 590 |
|
|
| 591 |
if (c == 0 || (braced && *(++ptr) != '}')) |
if (braced && *(++ptr) != '}') |
| 592 |
{ |
{ |
| 593 |
*errorcodeptr = ERR57; |
*errorcodeptr = ERR57; |
| 594 |
break; |
break; |
| 595 |
} |
} |
| 596 |
|
|
| 597 |
|
if (c == 0) |
| 598 |
|
{ |
| 599 |
|
*errorcodeptr = ERR58; |
| 600 |
|
break; |
| 601 |
|
} |
| 602 |
|
|
| 603 |
if (negated) |
if (negated) |
| 604 |
{ |
{ |
| 634 |
c -= '0'; |
c -= '0'; |
| 635 |
while ((digitab[ptr[1]] & ctype_digit) != 0) |
while ((digitab[ptr[1]] & ctype_digit) != 0) |
| 636 |
c = c * 10 + *(++ptr) - '0'; |
c = c * 10 + *(++ptr) - '0'; |
| 637 |
if (c < 0) |
if (c < 0) /* Integer overflow */ |
| 638 |
{ |
{ |
| 639 |
*errorcodeptr = ERR61; |
*errorcodeptr = ERR61; |
| 640 |
break; |
break; |
| 1567 |
|
|
| 1568 |
/* Groups with zero repeats can of course be empty; skip them. */ |
/* Groups with zero repeats can of course be empty; skip them. */ |
| 1569 |
|
|
| 1570 |
if (c == OP_BRAZERO || c == OP_BRAMINZERO) |
if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO) |
| 1571 |
{ |
{ |
| 1572 |
code += _pcre_OP_lengths[c]; |
code += _pcre_OP_lengths[c]; |
| 1573 |
do code += GET(code, 1); while (*code == OP_ALT); |
do code += GET(code, 1); while (*code == OP_ALT); |
| 1847 |
that is referenced. This means that groups can be replicated for fixed |
that is referenced. This means that groups can be replicated for fixed |
| 1848 |
repetition simply by copying (because the recursion is allowed to refer to |
repetition simply by copying (because the recursion is allowed to refer to |
| 1849 |
earlier groups that are outside the current group). However, when a group is |
earlier groups that are outside the current group). However, when a group is |
| 1850 |
optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before |
optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is |
| 1851 |
it, after it has been compiled. This means that any OP_RECURSE items within it |
inserted before it, after it has been compiled. This means that any OP_RECURSE |
| 1852 |
that refer to the group itself or any contained groups have to have their |
items within it that refer to the group itself or any contained groups have to |
| 1853 |
offsets adjusted. That is one of the jobs of this function. Before it is called, |
have their offsets adjusted. That is one of the jobs of this function. Before it |
| 1854 |
the partially compiled regex must be temporarily terminated with OP_END. |
is called, the partially compiled regex must be temporarily terminated with |
| 1855 |
|
OP_END. |
| 1856 |
|
|
| 1857 |
This function has been extended with the possibility of forward references for |
This function has been extended with the possibility of forward references for |
| 1858 |
recursions and subroutine calls. It must also check the list of such references |
recursions and subroutine calls. It must also check the list of such references |
| 1883 |
|
|
| 1884 |
/* See if this recursion is on the forward reference list. If so, adjust the |
/* See if this recursion is on the forward reference list. If so, adjust the |
| 1885 |
reference. */ |
reference. */ |
| 1886 |
|
|
| 1887 |
for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE) |
for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE) |
| 1888 |
{ |
{ |
| 1889 |
offset = GET(hc, 0); |
offset = GET(hc, 0); |
| 2137 |
/* For OP_NOT, "item" must be a single-byte character. */ |
/* For OP_NOT, "item" must be a single-byte character. */ |
| 2138 |
|
|
| 2139 |
case OP_NOT: |
case OP_NOT: |
|
if (next < 0) return FALSE; /* Not a character */ |
|
| 2140 |
if (item == next) return TRUE; |
if (item == next) return TRUE; |
| 2141 |
if ((options & PCRE_CASELESS) == 0) return FALSE; |
if ((options & PCRE_CASELESS) == 0) return FALSE; |
| 2142 |
#ifdef SUPPORT_UTF8 |
#ifdef SUPPORT_UTF8 |
| 2459 |
/* Get next byte in the pattern */ |
/* Get next byte in the pattern */ |
| 2460 |
|
|
| 2461 |
c = *ptr; |
c = *ptr; |
| 2462 |
|
|
| 2463 |
/* If we are in the pre-compile phase, accumulate the length used for the |
/* If we are in the pre-compile phase, accumulate the length used for the |
| 2464 |
previous cycle of this loop. */ |
previous cycle of this loop. */ |
| 2465 |
|
|
| 3843 |
|
|
| 3844 |
if (repeat_min == 0) |
if (repeat_min == 0) |
| 3845 |
{ |
{ |
| 3846 |
/* If the maximum is also zero, we just omit the group from the output |
/* If the maximum is also zero, we used to just omit the group from the |
| 3847 |
altogether. */ |
output altogether, like this: |
|
|
|
|
if (repeat_max == 0) |
|
|
{ |
|
|
code = previous; |
|
|
goto END_REPEAT; |
|
|
} |
|
| 3848 |
|
|
| 3849 |
/* If the maximum is 1 or unlimited, we just have to stick in the |
** if (repeat_max == 0) |
| 3850 |
BRAZERO and do no more at this point. However, we do need to adjust |
** { |
| 3851 |
any OP_RECURSE calls inside the group that refer to the group itself or |
** code = previous; |
| 3852 |
any internal or forward referenced group, because the offset is from |
** goto END_REPEAT; |
| 3853 |
the start of the whole regex. Temporarily terminate the pattern while |
** } |
| 3854 |
doing this. */ |
|
| 3855 |
|
However, that fails when a group is referenced as a subroutine from |
| 3856 |
|
elsewhere in the pattern, so now we stick in OP_SKIPZERO in front of it |
| 3857 |
|
so that it is skipped on execution. As we don't have a list of which |
| 3858 |
|
groups are referenced, we cannot do this selectively. |
| 3859 |
|
|
| 3860 |
|
If the maximum is 1 or unlimited, we just have to stick in the BRAZERO |
| 3861 |
|
and do no more at this point. However, we do need to adjust any |
| 3862 |
|
OP_RECURSE calls inside the group that refer to the group itself or any |
| 3863 |
|
internal or forward referenced group, because the offset is from the |
| 3864 |
|
start of the whole regex. Temporarily terminate the pattern while doing |
| 3865 |
|
this. */ |
| 3866 |
|
|
| 3867 |
if (repeat_max <= 1) |
if (repeat_max <= 1) /* Covers 0, 1, and unlimited */ |
| 3868 |
{ |
{ |
| 3869 |
*code = OP_END; |
*code = OP_END; |
| 3870 |
adjust_recurse(previous, 1, utf8, cd, save_hwm); |
adjust_recurse(previous, 1, utf8, cd, save_hwm); |
| 3871 |
memmove(previous+1, previous, len); |
memmove(previous+1, previous, len); |
| 3872 |
code++; |
code++; |
| 3873 |
|
if (repeat_max == 0) |
| 3874 |
|
{ |
| 3875 |
|
*previous++ = OP_SKIPZERO; |
| 3876 |
|
goto END_REPEAT; |
| 3877 |
|
} |
| 3878 |
*previous++ = OP_BRAZERO + repeat_type; |
*previous++ = OP_BRAZERO + repeat_type; |
| 3879 |
} |
} |
| 3880 |
|
|
| 4149 |
bravalue = OP_CBRA; |
bravalue = OP_CBRA; |
| 4150 |
save_hwm = cd->hwm; |
save_hwm = cd->hwm; |
| 4151 |
reset_bracount = FALSE; |
reset_bracount = FALSE; |
| 4152 |
|
|
| 4153 |
/* First deal with various "verbs" that can be introduced by '*'. */ |
/* First deal with various "verbs" that can be introduced by '*'. */ |
| 4154 |
|
|
| 4155 |
if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0) |
if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0) |
| 4601 |
references (?P=name) and recursion (?P>name), as well as falling |
references (?P=name) and recursion (?P>name), as well as falling |
| 4602 |
through from the Perl recursion syntax (?&name). We also come here from |
through from the Perl recursion syntax (?&name). We also come here from |
| 4603 |
the Perl \k<name> or \k'name' back reference syntax and the \k{name} |
the Perl \k<name> or \k'name' back reference syntax and the \k{name} |
| 4604 |
.NET syntax. */ |
.NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */ |
| 4605 |
|
|
| 4606 |
NAMED_REF_OR_RECURSE: |
NAMED_REF_OR_RECURSE: |
| 4607 |
name = ++ptr; |
name = ++ptr; |
| 4679 |
case '5': case '6': case '7': case '8': case '9': /* subroutine */ |
case '5': case '6': case '7': case '8': case '9': /* subroutine */ |
| 4680 |
{ |
{ |
| 4681 |
const uschar *called; |
const uschar *called; |
| 4682 |
|
terminator = ')'; |
| 4683 |
|
|
| 4684 |
|
/* Come here from the \g<...> and \g'...' code (Oniguruma |
| 4685 |
|
compatibility). However, the syntax has been checked to ensure that |
| 4686 |
|
the ... are a (signed) number, so that neither ERR63 nor ERR29 will |
| 4687 |
|
be called on this path, nor will the jump to OTHER_CHAR_AFTER_QUERY |
| 4688 |
|
ever be taken. */ |
| 4689 |
|
|
| 4690 |
|
HANDLE_NUMERICAL_RECURSION: |
| 4691 |
|
|
| 4692 |
if ((refsign = *ptr) == '+') |
if ((refsign = *ptr) == '+') |
| 4693 |
{ |
{ |
| 4709 |
while((digitab[*ptr] & ctype_digit) != 0) |
while((digitab[*ptr] & ctype_digit) != 0) |
| 4710 |
recno = recno * 10 + *ptr++ - '0'; |
recno = recno * 10 + *ptr++ - '0'; |
| 4711 |
|
|
| 4712 |
if (*ptr != ')') |
if (*ptr != terminator) |
| 4713 |
{ |
{ |
| 4714 |
*errorcodeptr = ERR29; |
*errorcodeptr = ERR29; |
| 4715 |
goto FAILED; |
goto FAILED; |
| 5105 |
back references and those types that consume a character may be repeated. |
back references and those types that consume a character may be repeated. |
| 5106 |
We can test for values between ESC_b and ESC_Z for the latter; this may |
We can test for values between ESC_b and ESC_Z for the latter; this may |
| 5107 |
have to change if any new ones are ever created. */ |
have to change if any new ones are ever created. */ |
| 5108 |
|
|
| 5109 |
case '\\': |
case '\\': |
| 5110 |
tempptr = ptr; |
tempptr = ptr; |
| 5111 |
c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE); |
c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE); |
| 5132 |
|
|
| 5133 |
zerofirstbyte = firstbyte; |
zerofirstbyte = firstbyte; |
| 5134 |
zeroreqbyte = reqbyte; |
zeroreqbyte = reqbyte; |
| 5135 |
|
|
| 5136 |
|
/* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n' |
| 5137 |
|
is a subroutine call by number (Oniguruma syntax). In fact, the value |
| 5138 |
|
-ESC_g is returned only for these cases. So we don't need to check for < |
| 5139 |
|
or ' if the value is -ESC_g. For the Perl syntax \g{n} the value is |
| 5140 |
|
-ESC_REF+n, and for the Perl syntax \g{name} the result is -ESC_k (as |
| 5141 |
|
that is a synonym for a named back reference). */ |
| 5142 |
|
|
| 5143 |
|
if (-c == ESC_g) |
| 5144 |
|
{ |
| 5145 |
|
const uschar *p; |
| 5146 |
|
save_hwm = cd->hwm; /* Normally this is set when '(' is read */ |
| 5147 |
|
terminator = (*(++ptr) == '<')? '>' : '\''; |
| 5148 |
|
|
| 5149 |
|
/* These two statements stop the compiler from warning about possibly |
| 5150 |
|
unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In |
| 5151 |
|
fact, because we actually check for a number below, the paths that |
| 5152 |
|
would actually be in error are never taken. */ |
| 5153 |
|
|
| 5154 |
|
skipbytes = 0; |
| 5155 |
|
reset_bracount = FALSE; |
| 5156 |
|
|
| 5157 |
|
/* Test for a name */ |
| 5158 |
|
|
| 5159 |
|
if (ptr[1] != '+' && ptr[1] != '-') |
| 5160 |
|
{ |
| 5161 |
|
BOOL isnumber = TRUE; |
| 5162 |
|
for (p = ptr + 1; *p != 0 && *p != terminator; p++) |
| 5163 |
|
{ |
| 5164 |
|
if ((cd->ctypes[*p] & ctype_digit) == 0) isnumber = FALSE; |
| 5165 |
|
if ((cd->ctypes[*p] & ctype_word) == 0) break; |
| 5166 |
|
} |
| 5167 |
|
if (*p != terminator) |
| 5168 |
|
{ |
| 5169 |
|
*errorcodeptr = ERR57; |
| 5170 |
|
break; |
| 5171 |
|
} |
| 5172 |
|
if (isnumber) |
| 5173 |
|
{ |
| 5174 |
|
ptr++; |
| 5175 |
|
goto HANDLE_NUMERICAL_RECURSION; |
| 5176 |
|
} |
| 5177 |
|
is_recurse = TRUE; |
| 5178 |
|
goto NAMED_REF_OR_RECURSE; |
| 5179 |
|
} |
| 5180 |
|
|
| 5181 |
|
/* Test a signed number in angle brackets or quotes. */ |
| 5182 |
|
|
| 5183 |
|
p = ptr + 2; |
| 5184 |
|
while ((digitab[*p] & ctype_digit) != 0) p++; |
| 5185 |
|
if (*p != terminator) |
| 5186 |
|
{ |
| 5187 |
|
*errorcodeptr = ERR57; |
| 5188 |
|
break; |
| 5189 |
|
} |
| 5190 |
|
ptr++; |
| 5191 |
|
goto HANDLE_NUMERICAL_RECURSION; |
| 5192 |
|
} |
| 5193 |
|
|
| 5194 |
/* \k<name> or \k'name' is a back reference by name (Perl syntax). |
/* \k<name> or \k'name' is a back reference by name (Perl syntax). |
| 5195 |
We also support \k{name} (.NET syntax) */ |
We also support \k{name} (.NET syntax) */ |
| 6209 |
if (groupptr == NULL) errorcode = ERR53; |
if (groupptr == NULL) errorcode = ERR53; |
| 6210 |
else PUT(((uschar *)codestart), offset, groupptr - codestart); |
else PUT(((uschar *)codestart), offset, groupptr - codestart); |
| 6211 |
} |
} |
| 6212 |
|
|
| 6213 |
/* Give an error if there's a back reference to a non-existent capturing |
/* Give an error if there's a back reference to a non-existent capturing |
| 6214 |
subpattern. */ |
subpattern. */ |
| 6215 |
|
|