| 6 |
and semantics are as close as possible to those of the Perl 5 language. |
and semantics are as close as possible to those of the Perl 5 language. |
| 7 |
|
|
| 8 |
Written by Philip Hazel |
Written by Philip Hazel |
| 9 |
Copyright (c) 1997-2007 University of Cambridge |
Copyright (c) 1997-2008 University of Cambridge |
| 10 |
|
|
| 11 |
----------------------------------------------------------------------------- |
----------------------------------------------------------------------------- |
| 12 |
Redistribution and use in source and binary forms, with or without |
Redistribution and use in source and binary forms, with or without |
| 1738 |
|
|
| 1739 |
/* This function is called when the sequence "[:" or "[." or "[=" is |
/* This function is called when the sequence "[:" or "[." or "[=" is |
| 1740 |
encountered in a character class. It checks whether this is followed by a |
encountered in a character class. It checks whether this is followed by a |
| 1741 |
sequence of characters terminated by a matching ":]" or ".]" or "=]". If we |
sequence of characters terminated by a matching ":]" or ".]" or "=]". If we |
| 1742 |
reach an unescaped ']' without the special preceding character, return FALSE. |
reach an unescaped ']' without the special preceding character, return FALSE. |
| 1743 |
|
|
| 1744 |
Originally, this function only recognized a sequence of letters between the |
Originally, this function only recognized a sequence of letters between the |
| 1745 |
terminators, but it seems that Perl recognizes any sequence of characters, |
terminators, but it seems that Perl recognizes any sequence of characters, |
| 1746 |
though of course unknown POSIX names are subsequently rejected. Perl gives an |
though of course unknown POSIX names are subsequently rejected. Perl gives an |
| 1747 |
"Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE |
"Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE |
| 1748 |
didn't consider this to be a POSIX class. Likewise for [:1234:]. |
didn't consider this to be a POSIX class. Likewise for [:1234:]. |
| 1749 |
|
|
| 1750 |
The problem in trying to be exactly like Perl is in the handling of escapes. We |
The problem in trying to be exactly like Perl is in the handling of escapes. We |
| 1751 |
have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX |
have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX |
| 1752 |
class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code |
class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code |
| 1753 |
below handles the special case of \], but does not try to do any other escape |
below handles the special case of \], but does not try to do any other escape |
| 1754 |
processing. This makes it different from Perl for cases such as [:l\ower:] |
processing. This makes it different from Perl for cases such as [:l\ower:] |
| 1755 |
where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize |
where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize |
| 1756 |
"l\ower". This is a lesser evil than not diagnosing bad classes when Perl does, |
"l\ower". This is a lesser evil than not diagnosing bad classes when Perl does, |
| 1757 |
I think. |
I think. |
| 1758 |
|
|
| 1759 |
Arguments: |
Arguments: |
| 1771 |
for (++ptr; *ptr != 0; ptr++) |
for (++ptr; *ptr != 0; ptr++) |
| 1772 |
{ |
{ |
| 1773 |
if (*ptr == '\\' && ptr[1] == ']') ptr++; else |
if (*ptr == '\\' && ptr[1] == ']') ptr++; else |
| 1774 |
{ |
{ |
| 1775 |
if (*ptr == ']') return FALSE; |
if (*ptr == ']') return FALSE; |
| 1776 |
if (*ptr == terminator && ptr[1] == ']') |
if (*ptr == terminator && ptr[1] == ']') |
| 1777 |
{ |
{ |
| 1778 |
*endptr = ptr; |
*endptr = ptr; |
| 1779 |
return TRUE; |
return TRUE; |
| 1780 |
} |
} |
| 1781 |
} |
} |
| 1782 |
} |
} |
| 1783 |
return FALSE; |
return FALSE; |
| 1784 |
} |
} |
| 1785 |
|
|
| 2376 |
BOOL class_utf8; |
BOOL class_utf8; |
| 2377 |
BOOL utf8 = (options & PCRE_UTF8) != 0; |
BOOL utf8 = (options & PCRE_UTF8) != 0; |
| 2378 |
uschar *class_utf8data; |
uschar *class_utf8data; |
| 2379 |
|
uschar *class_utf8data_base; |
| 2380 |
uschar utf8_char[6]; |
uschar utf8_char[6]; |
| 2381 |
#else |
#else |
| 2382 |
BOOL utf8 = FALSE; |
BOOL utf8 = FALSE; |
| 2688 |
#ifdef SUPPORT_UTF8 |
#ifdef SUPPORT_UTF8 |
| 2689 |
class_utf8 = FALSE; /* No chars >= 256 */ |
class_utf8 = FALSE; /* No chars >= 256 */ |
| 2690 |
class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */ |
class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */ |
| 2691 |
|
class_utf8data_base = class_utf8data; /* For resetting in pass 1 */ |
| 2692 |
#endif |
#endif |
| 2693 |
|
|
| 2694 |
/* Process characters until ] is reached. By writing this as a "do" it |
/* Process characters until ] is reached. By writing this as a "do" it |
| 2704 |
{ /* Braces are required because the */ |
{ /* Braces are required because the */ |
| 2705 |
GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */ |
GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */ |
| 2706 |
} |
} |
| 2707 |
|
|
| 2708 |
|
/* In the pre-compile phase, accumulate the length of any UTF-8 extra |
| 2709 |
|
data and reset the pointer. This is so that very large classes that |
| 2710 |
|
contain a zillion UTF-8 characters no longer overwrite the work space |
| 2711 |
|
(which is on the stack). */ |
| 2712 |
|
|
| 2713 |
|
if (lengthptr != NULL) |
| 2714 |
|
{ |
| 2715 |
|
*lengthptr += class_utf8data - class_utf8data_base; |
| 2716 |
|
class_utf8data = class_utf8data_base; |
| 2717 |
|
} |
| 2718 |
|
|
| 2719 |
#endif |
#endif |
| 2720 |
|
|
| 2721 |
/* Inside \Q...\E everything is literal except \E */ |
/* Inside \Q...\E everything is literal except \E */ |
| 5821 |
|
|
| 5822 |
uschar cworkspace[COMPILE_WORK_SIZE]; |
uschar cworkspace[COMPILE_WORK_SIZE]; |
| 5823 |
|
|
|
|
|
| 5824 |
/* Set this early so that early errors get offset 0. */ |
/* Set this early so that early errors get offset 0. */ |
| 5825 |
|
|
| 5826 |
ptr = (const uschar *)pattern; |
ptr = (const uschar *)pattern; |