| 6 |
and semantics are as close as possible to those of the Perl 5 language. |
and semantics are as close as possible to those of the Perl 5 language. |
| 7 |
|
|
| 8 |
Written by Philip Hazel |
Written by Philip Hazel |
| 9 |
Copyright (c) 1997-2007 University of Cambridge |
Copyright (c) 1997-2008 University of Cambridge |
| 10 |
|
|
| 11 |
----------------------------------------------------------------------------- |
----------------------------------------------------------------------------- |
| 12 |
Redistribution and use in source and binary forms, with or without |
Redistribution and use in source and binary forms, with or without |
| 1737 |
*************************************************/ |
*************************************************/ |
| 1738 |
|
|
| 1739 |
/* This function is called when the sequence "[:" or "[." or "[=" is |
/* This function is called when the sequence "[:" or "[." or "[=" is |
| 1740 |
encountered in a character class. It checks whether this is followed by an |
encountered in a character class. It checks whether this is followed by a |
| 1741 |
optional ^ and then a sequence of letters, terminated by a matching ":]" or |
sequence of characters terminated by a matching ":]" or ".]" or "=]". If we |
| 1742 |
".]" or "=]". |
reach an unescaped ']' without the special preceding character, return FALSE. |
| 1743 |
|
|
| 1744 |
|
Originally, this function only recognized a sequence of letters between the |
| 1745 |
|
terminators, but it seems that Perl recognizes any sequence of characters, |
| 1746 |
|
though of course unknown POSIX names are subsequently rejected. Perl gives an |
| 1747 |
|
"Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE |
| 1748 |
|
didn't consider this to be a POSIX class. Likewise for [:1234:]. |
| 1749 |
|
|
| 1750 |
|
The problem in trying to be exactly like Perl is in the handling of escapes. We |
| 1751 |
|
have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX |
| 1752 |
|
class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code |
| 1753 |
|
below handles the special case of \], but does not try to do any other escape |
| 1754 |
|
processing. This makes it different from Perl for cases such as [:l\ower:] |
| 1755 |
|
where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize |
| 1756 |

"l\ower". This is a lesser evil than not diagnosing bad classes when Perl does, |
| 1757 |
|
I think. |
| 1758 |
|
|
| 1759 |
Argument: |
Arguments: |
| 1760 |
ptr pointer to the initial [ |
ptr pointer to the initial [ |
| 1761 |
endptr where to return the end pointer |
endptr where to return the end pointer |
|
cd pointer to compile data |
|
| 1762 |
|
|
| 1763 |
Returns: TRUE or FALSE |
Returns: TRUE or FALSE |
| 1764 |
*/ |
*/ |
| 1765 |
|
|
| 1766 |
static BOOL |
static BOOL |
| 1767 |
check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd) |
check_posix_syntax(const uschar *ptr, const uschar **endptr) |
| 1768 |
{ |
{ |
| 1769 |
int terminator; /* Don't combine these lines; the Solaris cc */ |
int terminator; /* Don't combine these lines; the Solaris cc */ |
| 1770 |
terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */ |
terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */ |
| 1771 |
if (*(++ptr) == '^') ptr++; |
for (++ptr; *ptr != 0; ptr++) |
|
while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++; |
|
|
if (*ptr == terminator && ptr[1] == ']') |
|
| 1772 |
{ |
{ |
| 1773 |
*endptr = ptr; |
if (*ptr == '\\' && ptr[1] == ']') ptr++; else |
| 1774 |
return TRUE; |
{ |
| 1775 |
|
if (*ptr == ']') return FALSE; |
| 1776 |
|
if (*ptr == terminator && ptr[1] == ']') |
| 1777 |
|
{ |
| 1778 |
|
*endptr = ptr; |
| 1779 |
|
return TRUE; |
| 1780 |
|
} |
| 1781 |
|
} |
| 1782 |
} |
} |
| 1783 |
return FALSE; |
return FALSE; |
| 1784 |
} |
} |
| 2113 |
/* For OP_NOT, "item" must be a single-byte character. */ |
/* For OP_NOT, "item" must be a single-byte character. */ |
| 2114 |
|
|
| 2115 |
case OP_NOT: |
case OP_NOT: |
|
if (next < 0) return FALSE; /* Not a character */ |
|
| 2116 |
if (item == next) return TRUE; |
if (item == next) return TRUE; |
| 2117 |
if ((options & PCRE_CASELESS) == 0) return FALSE; |
if ((options & PCRE_CASELESS) == 0) return FALSE; |
| 2118 |
#ifdef SUPPORT_UTF8 |
#ifdef SUPPORT_UTF8 |
| 2375 |
BOOL class_utf8; |
BOOL class_utf8; |
| 2376 |
BOOL utf8 = (options & PCRE_UTF8) != 0; |
BOOL utf8 = (options & PCRE_UTF8) != 0; |
| 2377 |
uschar *class_utf8data; |
uschar *class_utf8data; |
| 2378 |
|
uschar *class_utf8data_base; |
| 2379 |
uschar utf8_char[6]; |
uschar utf8_char[6]; |
| 2380 |
#else |
#else |
| 2381 |
BOOL utf8 = FALSE; |
BOOL utf8 = FALSE; |
| 2639 |
they are encountered at the top level, so we'll do that too. */ |
they are encountered at the top level, so we'll do that too. */ |
| 2640 |
|
|
| 2641 |
if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') && |
if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') && |
| 2642 |
check_posix_syntax(ptr, &tempptr, cd)) |
check_posix_syntax(ptr, &tempptr)) |
| 2643 |
{ |
{ |
| 2644 |
*errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31; |
*errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31; |
| 2645 |
goto FAILED; |
goto FAILED; |
| 2687 |
#ifdef SUPPORT_UTF8 |
#ifdef SUPPORT_UTF8 |
| 2688 |
class_utf8 = FALSE; /* No chars >= 256 */ |
class_utf8 = FALSE; /* No chars >= 256 */ |
| 2689 |
class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */ |
class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */ |
| 2690 |
|
class_utf8data_base = class_utf8data; /* For resetting in pass 1 */ |
| 2691 |
#endif |
#endif |
| 2692 |
|
|
| 2693 |
/* Process characters until ] is reached. By writing this as a "do" it |
/* Process characters until ] is reached. By writing this as a "do" it |
| 2703 |
{ /* Braces are required because the */ |
{ /* Braces are required because the */ |
| 2704 |
GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */ |
GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */ |
| 2705 |
} |
} |
| 2706 |
|
|
| 2707 |
|
/* In the pre-compile phase, accumulate the length of any UTF-8 extra |
| 2708 |
|
data and reset the pointer. This is so that very large classes that |
| 2709 |
|
contain a zillion UTF-8 characters no longer overwrite the work space |
| 2710 |
|
(which is on the stack). */ |
| 2711 |
|
|
| 2712 |
|
if (lengthptr != NULL) |
| 2713 |
|
{ |
| 2714 |
|
*lengthptr += class_utf8data - class_utf8data_base; |
| 2715 |
|
class_utf8data = class_utf8data_base; |
| 2716 |
|
} |
| 2717 |
|
|
| 2718 |
#endif |
#endif |
| 2719 |
|
|
| 2720 |
/* Inside \Q...\E everything is literal except \E */ |
/* Inside \Q...\E everything is literal except \E */ |
| 2738 |
|
|
| 2739 |
if (c == '[' && |
if (c == '[' && |
| 2740 |
(ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') && |
(ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') && |
| 2741 |
check_posix_syntax(ptr, &tempptr, cd)) |
check_posix_syntax(ptr, &tempptr)) |
| 2742 |
{ |
{ |
| 2743 |
BOOL local_negate = FALSE; |
BOOL local_negate = FALSE; |
| 2744 |
int posix_class, taboffset, tabopt; |
int posix_class, taboffset, tabopt; |
| 5820 |
|
|
| 5821 |
uschar cworkspace[COMPILE_WORK_SIZE]; |
uschar cworkspace[COMPILE_WORK_SIZE]; |
| 5822 |
|
|
|
|
|
| 5823 |
/* Set this early so that early errors get offset 0. */ |
/* Set this early so that early errors get offset 0. */ |
| 5824 |
|
|
| 5825 |
ptr = (const uschar *)pattern; |
ptr = (const uschar *)pattern; |