/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 574 - (hide annotations) (download)
Sat Nov 20 17:47:27 2010 UTC (2 years, 5 months ago) by ph10
File MIME type: text/plain
File size: 234633 byte(s)
Give error if \c is followed by a byte > 127 (in ASCII/UTF-8 modes).

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 475 Copyright (c) 1997-2010 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains the external function pcre_compile(), along with
42     supporting internal functions that are not used by other modules. */
43    
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 236 #include "config.h"
47 ph10 200 #endif
48 ph10 199
49 nigel 93 #define NLBLOCK cd /* Block containing newline information */
50     #define PSSTART start_pattern /* Field containing processed string start */
51     #define PSEND end_pattern /* Field containing processed string end */
52    
53 nigel 77 #include "pcre_internal.h"
54    
55    
56 ph10 475 /* When PCRE_DEBUG is defined, we need the pcre_printint() function, which is
57     also used by pcretest. PCRE_DEBUG is not defined when building a production
58     library. */
59 nigel 85
60 ph10 475 #ifdef PCRE_DEBUG
61 nigel 85 #include "pcre_printint.src"
62     #endif
63    
64    
/* Macro for setting individual bits in class bitmaps. It sets bit (b) in the
byte-array bitmap (a); bit 0 is the least significant bit of a[0]. Both
arguments and the whole expansion are parenthesized so that an expression
argument such as SETBIT(map, c+1) addresses the intended byte and bit. Note
that (b) is evaluated twice, so it must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
68    
/* Maximum length value to check against when making sure that the integer that
holds the compiled pattern length does not overflow. We make it a bit less than
INT_MAX to allow for adding in group terminating bytes, so that we don't have
to check them every time. */

#define OFLOW_MAX (INT_MAX - 20)
75    
76    
77 nigel 77 /*************************************************
78     * Code parameters and static tables *
79     *************************************************/
80    
/* This value specifies the size of stack workspace that is used during the
first pre-compile phase that determines how much memory is required. The regex
is partly compiled into this space, but the compiled parts are discarded as
soon as they can be, so that hopefully there will never be an overrun. The code
does, however, check for an overrun. The largest amount I've seen used is 218,
so this number is very generous.

The same workspace is used during the second, actual compile phase for
remembering forward references to groups so that they can be filled in at the
end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
is 4 there is plenty of room. */

#define COMPILE_WORK_SIZE (4096)

/* The overrun tests check for a slightly smaller size so that they detect the
overrun before it actually does run off the end of the data block. */

#define WORK_SIZE_CHECK (COMPILE_WORK_SIZE - 100)
99    
100    
101 nigel 77 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
102     are simple data values; negative values are for special things like \d and so
103     on. Zero means further processing is needed (for things like \x), or the escape
104     is invalid. */
105    
106 ph10 391 #ifndef EBCDIC
107    
108     /* This is the "normal" table for ASCII systems or for EBCDIC systems running
109 ph10 392 in UTF-8 mode. */
110 ph10 391
111 ph10 392 static const short int escapes[] = {
112 ph10 391 0, 0,
113     0, 0,
114 ph10 392 0, 0,
115     0, 0,
116     0, 0,
117 ph10 391 CHAR_COLON, CHAR_SEMICOLON,
118 ph10 392 CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN,
119 ph10 391 CHAR_GREATER_THAN_SIGN, CHAR_QUESTION_MARK,
120 ph10 392 CHAR_COMMERCIAL_AT, -ESC_A,
121     -ESC_B, -ESC_C,
122     -ESC_D, -ESC_E,
123     0, -ESC_G,
124     -ESC_H, 0,
125     0, -ESC_K,
126 ph10 391 0, 0,
127 ph10 514 -ESC_N, 0,
128 ph10 391 -ESC_P, -ESC_Q,
129     -ESC_R, -ESC_S,
130 ph10 392 0, 0,
131     -ESC_V, -ESC_W,
132     -ESC_X, 0,
133     -ESC_Z, CHAR_LEFT_SQUARE_BRACKET,
134 ph10 391 CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET,
135 ph10 392 CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE,
136 ph10 391 CHAR_GRAVE_ACCENT, 7,
137 ph10 392 -ESC_b, 0,
138     -ESC_d, ESC_e,
139 ph10 391 ESC_f, 0,
140     -ESC_h, 0,
141 ph10 392 0, -ESC_k,
142 ph10 391 0, 0,
143     ESC_n, 0,
144 ph10 392 -ESC_p, 0,
145     ESC_r, -ESC_s,
146 ph10 391 ESC_tee, 0,
147 ph10 392 -ESC_v, -ESC_w,
148     0, 0,
149 ph10 391 -ESC_z
150 nigel 77 };
151    
152 ph10 392 #else
153 ph10 391
154     /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
155    
156 nigel 77 static const short int escapes[] = {
157     /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
158     /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
159     /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
160     /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
161     /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
162     /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
163     /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
164     /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
165 ph10 178 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
166 nigel 93 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
167 nigel 77 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
168 ph10 178 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
169 nigel 77 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
170     /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
171     /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
172     /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
173 ph10 178 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
174 ph10 514 /* D0 */ '}', 0, -ESC_K, 0, 0,-ESC_N, 0, -ESC_P,
175 nigel 93 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
176 ph10 178 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
177 nigel 77 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
178     /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
179     /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
180     };
181     #endif
182    
183    
184 ph10 243 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
185     searched linearly. Put all the names into a single string, in order to reduce
186 ph10 392 the number of relocations when a shared library is dynamically linked. The
187     string is built from string macros so that it works in UTF-8 mode on EBCDIC
188 ph10 391 platforms. */
189 ph10 210
/* Description of one special verb. The entries of the verbs[] table are of
this type; the verb names themselves are held separately in verbnames[]. */

typedef struct verbitem {
  int   len;                 /* Length of verb name */
  int   op;                  /* Op when no arg, or -1 if arg mandatory */
  int   op_arg;              /* Op when arg present, or -1 if not allowed */
} verbitem;
195 ph10 210
196 ph10 240 static const char verbnames[] =
197 ph10 510 "\0" /* Empty name is a shorthand for MARK */
198 ph10 512 STRING_MARK0
199 ph10 391 STRING_ACCEPT0
200     STRING_COMMIT0
201     STRING_F0
202     STRING_FAIL0
203     STRING_PRUNE0
204     STRING_SKIP0
205     STRING_THEN;
206 ph10 240
207 ph10 327 static const verbitem verbs[] = {
208 ph10 510 { 0, -1, OP_MARK },
209 ph10 512 { 4, -1, OP_MARK },
210 ph10 510 { 6, OP_ACCEPT, -1 },
211     { 6, OP_COMMIT, -1 },
212     { 1, OP_FAIL, -1 },
213     { 4, OP_FAIL, -1 },
214     { 5, OP_PRUNE, OP_PRUNE_ARG },
215     { 4, OP_SKIP, OP_SKIP_ARG },
216     { 4, OP_THEN, OP_THEN_ARG }
217 ph10 210 };
218    
219 ph10 327 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
220 ph10 210
221    
222 ph10 243 /* Tables of names of POSIX character classes and their lengths. The names are
223     now all in a single string, to reduce the number of relocations when a shared
224 ph10 240 library is dynamically loaded. The list of lengths is terminated by a zero
225     length entry. The first three must be alpha, lower, upper, as this is assumed
226     for handling case independence. */
227 nigel 77
228 ph10 240 static const char posix_names[] =
229 ph10 392 STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
230     STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
231 ph10 391 STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
232     STRING_word0 STRING_xdigit;
233 nigel 77
234     static const uschar posix_name_lengths[] = {
235     5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
236    
237 nigel 87 /* Table of class bit maps for each POSIX class. Each class is formed from a
238     base map, with an optional addition or removal of another map. Then, for some
239     classes, there is some additional tweaking: for [:blank:] the vertical space
240     characters are removed, and for [:alpha:] and [:alnum:] the underscore
241     character is removed. The triples in the table consist of the base map offset,
242     second map offset or -1 if no second map, and a non-negative value for map
243     addition or a negative value for map subtraction (if there are two maps). The
244     absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
245     remove vertical space characters, 2 => remove underscore. */
246 nigel 77
247     static const int posix_class_maps[] = {
248 nigel 87 cbit_word, cbit_digit, -2, /* alpha */
249     cbit_lower, -1, 0, /* lower */
250     cbit_upper, -1, 0, /* upper */
251     cbit_word, -1, 2, /* alnum - word without underscore */
252     cbit_print, cbit_cntrl, 0, /* ascii */
253     cbit_space, -1, 1, /* blank - a GNU extension */
254     cbit_cntrl, -1, 0, /* cntrl */
255     cbit_digit, -1, 0, /* digit */
256     cbit_graph, -1, 0, /* graph */
257     cbit_print, -1, 0, /* print */
258     cbit_punct, -1, 0, /* punct */
259     cbit_space, -1, 0, /* space */
260     cbit_word, -1, 0, /* word - a Perl extension */
261     cbit_xdigit,-1, 0 /* xdigit */
262 nigel 77 };
263    
/* Table of substitutes for \d etc when PCRE_UCP is set. The POSIX class
substitutes must be in the order of the names, defined above, and there are
both positive and negative cases. NULL means no substitute. */

#ifdef SUPPORT_UCP
static const uschar *substitutes[] = {
  (uschar *)"\\P{Nd}",    /* \D */
  (uschar *)"\\p{Nd}",    /* \d */
  (uschar *)"\\P{Xsp}",   /* \S */       /* NOTE: Xsp is Perl space */
  (uschar *)"\\p{Xsp}",   /* \s */
  (uschar *)"\\P{Xwd}",   /* \W */
  (uschar *)"\\p{Xwd}"    /* \w */
};

/* The first half of this table holds the positive cases, one per POSIX class;
the second half holds the negated cases in the same class order. */

static const uschar *posix_substitutes[] = {
  (uschar *)"\\p{L}",     /* alpha */
  (uschar *)"\\p{Ll}",    /* lower */
  (uschar *)"\\p{Lu}",    /* upper */
  (uschar *)"\\p{Xan}",   /* alnum */
  NULL,                   /* ascii */
  (uschar *)"\\h",        /* blank */
  NULL,                   /* cntrl */
  (uschar *)"\\p{Nd}",    /* digit */
  NULL,                   /* graph */
  NULL,                   /* print */
  NULL,                   /* punct */
  (uschar *)"\\p{Xps}",   /* space */    /* NOTE: Xps is POSIX space */
  (uschar *)"\\p{Xwd}",   /* word */
  NULL,                   /* xdigit */
  /* Negated cases */
  (uschar *)"\\P{L}",     /* ^alpha */
  (uschar *)"\\P{Ll}",    /* ^lower */
  (uschar *)"\\P{Lu}",    /* ^upper */
  (uschar *)"\\P{Xan}",   /* ^alnum */
  NULL,                   /* ^ascii */
  (uschar *)"\\H",        /* ^blank */
  NULL,                   /* ^cntrl */
  (uschar *)"\\P{Nd}",    /* ^digit */
  NULL,                   /* ^graph */
  NULL,                   /* ^print */
  NULL,                   /* ^punct */
  (uschar *)"\\P{Xps}",   /* ^space */   /* NOTE: Xps is POSIX space */
  (uschar *)"\\P{Xwd}",   /* ^word */
  NULL                    /* ^xdigit */
};

/* Total number of entries (positive plus negated) in posix_substitutes[]. */

#define POSIX_SUBSIZE (sizeof(posix_substitutes)/sizeof(posix_substitutes[0]))
#endif
311 ph10 518
/* STRING(a) turns its argument into a string literal exactly as written.
XSTRING(s) goes through one extra level so that a macro argument is expanded
first, which is what is needed to embed the value of a numeric macro in a
string. */

#define STRING(a)  # a
#define XSTRING(s) STRING(s)
314    
315 nigel 77 /* The texts of compile-time error messages. These are "char *" because they
316 nigel 93 are passed to the outside world. Do not ever re-use any error number, because
317     they are documented. Always add a new error instead. Messages marked DEAD below
318 ph10 243 are no longer used. This used to be a table of strings, but in order to reduce
319     the number of relocations needed when a shared library is loaded dynamically,
320     it is now one long string. We cannot use a table of offsets, because the
321     lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
322     simply count through to the one we want - this isn't a performance issue
323 ph10 507 because these strings are used only when there is a compilation error.
324 nigel 77
325 ph10 507 Each substring ends with \0 to insert a null character. This includes the final
326     substring, so that the whole string ends with \0\0, which can be detected when
327 ph10 499 counting through. */
328    
329 ph10 240 static const char error_texts[] =
330     "no error\0"
331     "\\ at end of pattern\0"
332     "\\c at end of pattern\0"
333     "unrecognized character follows \\\0"
334     "numbers out of order in {} quantifier\0"
335 nigel 77 /* 5 */
336 ph10 240 "number too big in {} quantifier\0"
337     "missing terminating ] for character class\0"
338     "invalid escape sequence in character class\0"
339     "range out of order in character class\0"
340     "nothing to repeat\0"
341 nigel 77 /* 10 */
342 ph10 240 "operand of unlimited repeat could match the empty string\0" /** DEAD **/
343     "internal error: unexpected repeat\0"
344 ph10 269 "unrecognized character after (? or (?-\0"
345 ph10 240 "POSIX named classes are supported only within a class\0"
346     "missing )\0"
347 nigel 77 /* 15 */
348 ph10 240 "reference to non-existent subpattern\0"
349     "erroffset passed as NULL\0"
350     "unknown option bit(s) set\0"
351     "missing ) after comment\0"
352     "parentheses nested too deeply\0" /** DEAD **/
353 nigel 77 /* 20 */
354 ph10 240 "regular expression is too large\0"
355     "failed to get memory\0"
356     "unmatched parentheses\0"
357     "internal error: code overflow\0"
358     "unrecognized character after (?<\0"
359 nigel 77 /* 25 */
360 ph10 240 "lookbehind assertion is not fixed length\0"
361     "malformed number or name after (?(\0"
362     "conditional group contains more than two branches\0"
363     "assertion expected after (?(\0"
364     "(?R or (?[+-]digits must be followed by )\0"
365 nigel 77 /* 30 */
366 ph10 240 "unknown POSIX class name\0"
367     "POSIX collating elements are not supported\0"
368     "this version of PCRE is not compiled with PCRE_UTF8 support\0"
369     "spare error\0" /** DEAD **/
370     "character value in \\x{...} sequence is too large\0"
371 nigel 77 /* 35 */
372 ph10 240 "invalid condition (?(0)\0"
373     "\\C not allowed in lookbehind assertion\0"
374 ph10 514 "PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0"
375 ph10 240 "number after (?C is > 255\0"
376     "closing ) for (?C expected\0"
377 nigel 77 /* 40 */
378 ph10 240 "recursive call could loop indefinitely\0"
379     "unrecognized character after (?P\0"
380     "syntax error in subpattern name (missing terminator)\0"
381     "two named subpatterns have the same name\0"
382     "invalid UTF-8 string\0"
383 nigel 77 /* 45 */
384 ph10 240 "support for \\P, \\p, and \\X has not been compiled\0"
385     "malformed \\P or \\p sequence\0"
386     "unknown property name after \\P or \\p\0"
387     "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
388     "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
389 nigel 91 /* 50 */
390 ph10 240 "repeated subpattern is too long\0" /** DEAD **/
391     "octal value is greater than \\377 (not in UTF-8 mode)\0"
392     "internal error: overran compiling workspace\0"
393     "internal error: previously-checked referenced subpattern not found\0"
394     "DEFINE group contains more than one branch\0"
395 nigel 93 /* 55 */
396 ph10 240 "repeating a DEFINE group is not allowed\0"
397     "inconsistent NEWLINE options\0"
398 ph10 333 "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
399     "a numbered reference must not be zero\0"
400 ph10 510 "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0"
401 ph10 211 /* 60 */
402 ph10 240 "(*VERB) not recognized\0"
403 ph10 268 "number is too big\0"
404 ph10 272 "subpattern name expected\0"
405 ph10 336 "digit expected after (?+\0"
406 ph10 457 "] is an invalid data character in JavaScript compatibility mode\0"
407     /* 65 */
408 ph10 510 "different names for subpatterns of the same number are not allowed\0"
409 ph10 512 "(*MARK) must have an argument\0"
410 ph10 535 "this version of PCRE is not compiled with PCRE_UCP support\0"
411 ph10 574 "\\c must be followed by an ASCII character\0"
412 ph10 510 ;
413 nigel 77
414     /* Table to identify digits and hex digits. This is used when compiling
415     patterns. Note that the tables in chartables are dependent on the locale, and
416     may mark arbitrary characters as digits - but the PCRE compiling code expects
417     to handle only 0-9, a-f, and A-F as digits when compiling. That is why we have
418     a private table here. It costs 256 bytes, but it is a lot faster than doing
419     character value tests (at least in some simple cases I timed), and in some
420     applications one wants PCRE to compile efficiently as well as match
421     efficiently.
422    
423     For convenience, we use the same bit definitions as in chartables:
424    
425     0x04 decimal digit
426     0x08 hexadecimal digit
427    
428     Then we can use ctype_digit and ctype_xdigit in the code. */
429    
430 ph10 392 #ifndef EBCDIC
431 ph10 391
432 ph10 392 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
433 ph10 391 UTF-8 mode. */
434    
435 nigel 77 static const unsigned char digitab[] =
436     {
437     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
438     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
439     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
440     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
441     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
442     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
443     0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
444     0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
445     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
446     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
447     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
448     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
449     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
450     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
451     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
452     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
453     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
454     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
455     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
456     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
457     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
458     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
459     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
460     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
461     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
462     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
463     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
464     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
465     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
466     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
467     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
468     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
469    
470 ph10 392 #else
471 ph10 391
472     /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
473    
474 nigel 77 static const unsigned char digitab[] =
475     {
476     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
477     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
478     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
479     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
480     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
481     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
482     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
483     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
484     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
485     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
486     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
487 ph10 97 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
488 nigel 77 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
489     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
490     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
491     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
492     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
493     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
494     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
495     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
496     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
497     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
498     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
499     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
500     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
501     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
502     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
503     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
504     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
505     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
506     0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
507     0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
508    
509     static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
510     0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
511     0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
512     0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
513     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
514     0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
515     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
516     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
517     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
518     0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
519     0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
520     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
521 ph10 97 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
522 nigel 77 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
523     0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
524     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
525     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
526     0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
527     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
528     0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
529     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
530     0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
531     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
532     0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
533     0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
534     0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
535     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
536     0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
537     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
538     0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
539     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
540     0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
541     0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
542     #endif
543    
544    
545     /* Definition to allow mutual recursion */
546    
547     static BOOL
548 ph10 180 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
549 ph10 175 int *, int *, branch_chain *, compile_data *, int *);
550 nigel 77
551    
552    
553     /*************************************************
554 ph10 240 * Find an error text *
555     *************************************************/
556    
557 ph10 243 /* The error texts are now all in one long string, to save on relocations. As
558     some of the text is of unknown length, we can't use a table of offsets.
559     Instead, just count through the strings. This is not a performance issue
560 ph10 240 because it happens only when there has been a compilation error.
561    
562     Argument: the error number
563     Returns: pointer to the error string
564     */
565    
566     static const char *
567     find_error_text(int n)
568     {
569     const char *s = error_texts;
570 ph10 507 for (; n > 0; n--)
571 ph10 499 {
572     while (*s++ != 0) {};
573     if (*s == 0) return "Error text not found (please report)";
574 ph10 507 }
575 ph10 240 return s;
576     }
577    
578    
579     /*************************************************
580 nigel 77 * Handle escapes *
581     *************************************************/
582    
583     /* This function is called when a \ has been encountered. It either returns a
584     positive value for a simple escape such as \n, or a negative value which
585 nigel 93 encodes one of the more complicated things such as \d. A backreference to group
586     n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
587     UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
588     ptr is pointing at the \. On exit, it is on the final character of the escape
589     sequence.
590 nigel 77
591     Arguments:
592     ptrptr points to the pattern position pointer
593     errorcodeptr points to the errorcode variable
594     bracount number of previous extracting brackets
595     options the options bits
596     isclass TRUE if inside a character class
597    
598     Returns: zero or positive => a data character
599     negative => a special escape sequence
600 ph10 213 on error, errorcodeptr is set
601 nigel 77 */
602    
603     static int
604     check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
605     int options, BOOL isclass)
606     {
607 nigel 87 BOOL utf8 = (options & PCRE_UTF8) != 0;
608     const uschar *ptr = *ptrptr + 1;
609 nigel 77 int c, i;
610    
611 nigel 87 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
612     ptr--; /* Set pointer back to the last byte */
613    
614 nigel 77 /* If backslash is at the end of the pattern, it's an error. */
615    
616     if (c == 0) *errorcodeptr = ERR1;
617    
618 ph10 274 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
619     in a table. A non-zero result is something that can be returned immediately.
620 nigel 77 Otherwise further processing may be required. */
621    
622 ph10 391 #ifndef EBCDIC /* ASCII/UTF-8 coding */
623     else if (c < CHAR_0 || c > CHAR_z) {} /* Not alphanumeric */
624     else if ((i = escapes[c - CHAR_0]) != 0) c = i;
625 nigel 77
626 ph10 97 #else /* EBCDIC coding */
627 ph10 274 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphanumeric */
628 nigel 77 else if ((i = escapes[c - 0x48]) != 0) c = i;
629     #endif
630    
631     /* Escapes that need further processing, or are illegal. */
632    
633     else
634     {
635     const uschar *oldptr;
636 nigel 93 BOOL braced, negated;
637    
638 nigel 77 switch (c)
639     {
640     /* A number of Perl escapes are not handled by PCRE. We give an explicit
641     error. */
642    
643 ph10 391 case CHAR_l:
644     case CHAR_L:
645     case CHAR_u:
646     case CHAR_U:
647 nigel 77 *errorcodeptr = ERR37;
648     break;
649    
650 ph10 333 /* \g must be followed by one of a number of specific things:
651 ph10 345
652 ph10 333 (1) A number, either plain or braced. If positive, it is an absolute
653     backreference. If negative, it is a relative backreference. This is a Perl
654     5.10 feature.
655 ph10 345
656 ph10 333 (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
657     is part of Perl's movement towards a unified syntax for back references. As
658     this is synonymous with \k{name}, we fudge it up by pretending it really
659     was \k.
660 ph10 345
661     (3) For Oniguruma compatibility we also support \g followed by a name or a
662     number either in angle brackets or in single quotes. However, these are
663     (possibly recursive) subroutine calls, _not_ backreferences. Just return
664 ph10 333 the -ESC_g code (cf \k). */
665 nigel 93
666 ph10 391 case CHAR_g:
667     if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
668 ph10 333 {
669     c = -ESC_g;
670 ph10 345 break;
671     }
672 ph10 333
673     /* Handle the Perl-compatible cases */
674 ph10 345
675 ph10 391 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
676 nigel 93 {
677 ph10 171 const uschar *p;
678 ph10 391 for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
679     if (*p != CHAR_MINUS && (digitab[*p] & ctype_digit) == 0) break;
680     if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)
681 ph10 171 {
682     c = -ESC_k;
683     break;
684 ph10 172 }
685 nigel 93 braced = TRUE;
686     ptr++;
687     }
688     else braced = FALSE;
689    
690 ph10 391 if (ptr[1] == CHAR_MINUS)
691 nigel 93 {
692     negated = TRUE;
693     ptr++;
694     }
695     else negated = FALSE;
696    
697     c = 0;
698     while ((digitab[ptr[1]] & ctype_digit) != 0)
699 ph10 391 c = c * 10 + *(++ptr) - CHAR_0;
700 ph10 220
701 ph10 333 if (c < 0) /* Integer overflow */
702 ph10 213 {
703     *errorcodeptr = ERR61;
704     break;
705 ph10 220 }
706 ph10 345
707 ph10 391 if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
708 nigel 93 {
709     *errorcodeptr = ERR57;
710 ph10 213 break;
711 nigel 93 }
712 ph10 345
713 ph10 333 if (c == 0)
714     {
715     *errorcodeptr = ERR58;
716     break;
717 ph10 345 }
718 nigel 93
719     if (negated)
720     {
721     if (c > bracount)
722     {
723     *errorcodeptr = ERR15;
724 ph10 213 break;
725 nigel 93 }
726     c = bracount - (c - 1);
727     }
728    
729     c = -(ESC_REF + c);
730     break;
731    
732 nigel 77 /* The handling of escape sequences consisting of a string of digits
733     starting with one that is not zero is not straightforward. By experiment,
734     the way Perl works seems to be as follows:
735    
736     Outside a character class, the digits are read as a decimal number. If the
737     number is less than 10, or if there are that many previous extracting
738     left brackets, then it is a back reference. Otherwise, up to three octal
739     digits are read to form an escaped byte. Thus \123 is likely to be octal
740     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
741     value is greater than 377, the least significant 8 bits are taken. Inside a
742     character class, \ followed by a digit is always an octal number. */
743    
744 ph10 391 case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
745     case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
746 nigel 77
747     if (!isclass)
748     {
749     oldptr = ptr;
750 ph10 391 c -= CHAR_0;
751 nigel 77 while ((digitab[ptr[1]] & ctype_digit) != 0)
752 ph10 391 c = c * 10 + *(++ptr) - CHAR_0;
753 ph10 333 if (c < 0) /* Integer overflow */
754 ph10 213 {
755     *errorcodeptr = ERR61;
756 ph10 220 break;
757     }
758 nigel 77 if (c < 10 || c <= bracount)
759     {
760     c = -(ESC_REF + c);
761     break;
762     }
763     ptr = oldptr; /* Put the pointer back and fall through */
764     }
765    
766     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
767     generates a binary zero byte and treats the digit as a following literal.
768     Thus we have to pull back the pointer by one. */
769    
770 ph10 391 if ((c = *ptr) >= CHAR_8)
771 nigel 77 {
772     ptr--;
773     c = 0;
774     break;
775     }
776    
777     /* \0 always starts an octal number, but we may drop through to here with a
778 nigel 91 larger first octal digit. The original code used just to take the least
779     significant 8 bits of octal numbers (I think this is what early Perls used
780     to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
781     than 3 octal digits. */
782 nigel 77
783 ph10 391 case CHAR_0:
784     c -= CHAR_0;
785     while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
786     c = c * 8 + *(++ptr) - CHAR_0;
787 nigel 91 if (!utf8 && c > 255) *errorcodeptr = ERR51;
788 nigel 77 break;
789    
790 nigel 87 /* \x is complicated. \x{ddd} is a character number which can be greater
791     than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
792     treated as a data character. */
793 nigel 77
794 ph10 391 case CHAR_x:
795     if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
796 nigel 77 {
797     const uschar *pt = ptr + 2;
798 nigel 87 int count = 0;
799    
800 nigel 77 c = 0;
801     while ((digitab[*pt] & ctype_xdigit) != 0)
802     {
803 nigel 87 register int cc = *pt++;
804 ph10 391 if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
805 nigel 77 count++;
806 nigel 87
807 ph10 391 #ifndef EBCDIC /* ASCII/UTF-8 coding */
808     if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
809     c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
810 ph10 97 #else /* EBCDIC coding */
811 ph10 391 if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
812     c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
813 nigel 77 #endif
814     }
815 nigel 87
816 ph10 391 if (*pt == CHAR_RIGHT_CURLY_BRACKET)
817 nigel 77 {
818 nigel 87 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
819 nigel 77 ptr = pt;
820     break;
821     }
822 nigel 87
823 nigel 77 /* If the sequence of hex digits does not end with '}', then we don't
824     recognize this construct; fall through to the normal \x handling. */
825     }
826    
827 nigel 87 /* Read just a single-byte hex-defined char */
828 nigel 77
829     c = 0;
830     while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
831     {
832 ph10 391 int cc; /* Some compilers don't like */
833     cc = *(++ptr); /* ++ in initializers */
834     #ifndef EBCDIC /* ASCII/UTF-8 coding */
835     if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
836     c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
837 ph10 97 #else /* EBCDIC coding */
838 ph10 391 if (cc <= CHAR_z) cc += 64; /* Convert to upper case */
839     c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
840 nigel 77 #endif
841     }
842     break;
843    
844 nigel 93 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
845 ph10 574 An error is given if the byte following \c is not an ASCII character. This
846     coding is ASCII-specific, but then the whole concept of \cx is
847 nigel 93 ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
848 nigel 77
849 ph10 391 case CHAR_c:
850 nigel 77 c = *(++ptr);
851     if (c == 0)
852     {
853     *errorcodeptr = ERR2;
854 ph10 213 break;
855 nigel 77 }
856 ph10 574 #ifndef EBCDIC /* ASCII/UTF-8 coding */
857     if (c > 127) /* Excludes all non-ASCII in either mode */
858     {
859     *errorcodeptr = ERR68;
860     break;
861     }
862 ph10 391 if (c >= CHAR_a && c <= CHAR_z) c -= 32;
863 nigel 77 c ^= 0x40;
864 ph10 574 #else /* EBCDIC coding */
865 ph10 391 if (c >= CHAR_a && c <= CHAR_z) c += 64;
866 nigel 77 c ^= 0xC0;
867     #endif
868     break;
869    
870     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
871 ph10 274 other alphanumeric following \ is an error if PCRE_EXTRA was set;
872     otherwise, for Perl compatibility, it is a literal. This code looks a bit
873     odd, but there used to be some cases other than the default, and there may
874     be again in future, so I haven't "optimized" it. */
875 nigel 77
876     default:
877     if ((options & PCRE_EXTRA) != 0) switch(c)
878     {
879     default:
880     *errorcodeptr = ERR3;
881     break;
882     }
883     break;
884     }
885     }
886 ph10 518
887     /* Perl supports \N{name} for character names, as well as plain \N for "not
888 ph10 514 newline". PCRE does not support \N{name}. */
889 nigel 77
890 ph10 514 if (c == -ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET)
891 ph10 518 *errorcodeptr = ERR37;
892 ph10 514
893 ph10 518 /* If PCRE_UCP is set, we change the values for \d etc. */
894    
895     if ((options & PCRE_UCP) != 0 && c <= -ESC_D && c >= -ESC_w)
896     c -= (ESC_DU - ESC_D);
897    
898     /* Set the pointer to the final character before returning. */
899    
900 nigel 77 *ptrptr = ptr;
901     return c;
902     }
903    
904    
905    
906     #ifdef SUPPORT_UCP
907     /*************************************************
908     * Handle \P and \p *
909     *************************************************/
910    
911     /* This function is called after \P or \p has been encountered, provided that
912     PCRE is compiled with support for Unicode properties. On entry, ptrptr is
913     pointing at the P or p. On exit, it is pointing at the final character of the
914     escape sequence.
915    
916     Argument:
917     ptrptr points to the pattern position pointer
918     negptr points to a boolean that is set TRUE for negation else FALSE
919 nigel 87 dptr points to an int that is set to the detailed property value
920 nigel 77 errorcodeptr points to the error code variable
921    
922 nigel 87 Returns: type value from ucp_type_table, or -1 for an invalid type
923 nigel 77 */
924    
925     static int
926 nigel 87 get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
927 nigel 77 {
928     int c, i, bot, top;
929     const uschar *ptr = *ptrptr;
930 nigel 87 char name[32];
931 nigel 77
932     c = *(++ptr);
933     if (c == 0) goto ERROR_RETURN;
934    
935     *negptr = FALSE;
936    
937 nigel 87 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
938     negation. */
939 nigel 77
940 ph10 391 if (c == CHAR_LEFT_CURLY_BRACKET)
941 nigel 77 {
942 ph10 391 if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
943 nigel 77 {
944     *negptr = TRUE;
945     ptr++;
946     }
947 ph10 199 for (i = 0; i < (int)sizeof(name) - 1; i++)
948 nigel 77 {
949     c = *(++ptr);
950     if (c == 0) goto ERROR_RETURN;
951 ph10 391 if (c == CHAR_RIGHT_CURLY_BRACKET) break;
952 nigel 77 name[i] = c;
953     }
954 ph10 391 if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
955 nigel 77 name[i] = 0;
956     }
957    
958     /* Otherwise there is just one following character */
959    
960     else
961     {
962     name[0] = c;
963     name[1] = 0;
964     }
965    
966     *ptrptr = ptr;
967    
968     /* Search for a recognized property name using binary chop */
969    
970     bot = 0;
971     top = _pcre_utt_size;
972    
973     while (bot < top)
974     {
975 nigel 87 i = (bot + top) >> 1;
976 ph10 240 c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);
977 nigel 87 if (c == 0)
978     {
979     *dptr = _pcre_utt[i].value;
980     return _pcre_utt[i].type;
981     }
982 nigel 77 if (c > 0) bot = i + 1; else top = i;
983     }
984    
985     *errorcodeptr = ERR47;
986     *ptrptr = ptr;
987     return -1;
988    
989     ERROR_RETURN:
990     *errorcodeptr = ERR46;
991     *ptrptr = ptr;
992     return -1;
993     }
994     #endif
995    
996    
997    
998    
999     /*************************************************
1000     * Check for counted repeat *
1001     *************************************************/
1002    
1003     /* This function is called when a '{' is encountered in a place where it might
1004     start a quantifier. It looks ahead to see if it really is a quantifier or not.
1005     It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
1006     where the ddds are digits.
1007    
1008     Arguments:
1009     p pointer to the first char after '{'
1010    
1011     Returns: TRUE or FALSE
1012     */
1013    
1014     static BOOL
1015     is_counted_repeat(const uschar *p)
1016     {
1017     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
1018     while ((digitab[*p] & ctype_digit) != 0) p++;
1019 ph10 391 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
1020 nigel 77
1021 ph10 391 if (*p++ != CHAR_COMMA) return FALSE;
1022     if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
1023 nigel 77
1024     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
1025     while ((digitab[*p] & ctype_digit) != 0) p++;
1026    
1027 ph10 391 return (*p == CHAR_RIGHT_CURLY_BRACKET);
1028 nigel 77 }
1029    
1030    
1031    
1032     /*************************************************
1033     * Read repeat counts *
1034     *************************************************/
1035    
1036     /* Read an item of the form {n,m} and return the values. This is called only
1037     after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1038     so the syntax is guaranteed to be correct, but we need to check the values.
1039    
1040     Arguments:
1041     p pointer to first char after '{'
1042     minp pointer to int for min
1043     maxp pointer to int for max
1044     returned as -1 if no max
1045     errorcodeptr points to error code variable
1046    
1047     Returns: pointer to '}' on success;
1048     current ptr on error, with errorcodeptr set non-zero
1049     */
1050    
1051     static const uschar *
1052     read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
1053     {
1054     int min = 0;
1055     int max = -1;
1056    
1057 nigel 81 /* Read the minimum value and do a paranoid check: a negative value indicates
1058     an integer overflow. */
1059    
1060 ph10 391 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - CHAR_0;
1061 nigel 81 if (min < 0 || min > 65535)
1062     {
1063     *errorcodeptr = ERR5;
1064     return p;
1065     }
1066 nigel 77
1067 nigel 81 /* Read the maximum value if there is one, and again do a paranoid on its size.
1068     Also, max must not be less than min. */
1069    
1070 ph10 391 if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
1071 nigel 77 {
1072 ph10 391 if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
1073 nigel 77 {
1074     max = 0;
1075 ph10 391 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - CHAR_0;
1076 nigel 81 if (max < 0 || max > 65535)
1077     {
1078     *errorcodeptr = ERR5;
1079     return p;
1080     }
1081 nigel 77 if (max < min)
1082     {
1083     *errorcodeptr = ERR4;
1084     return p;
1085     }
1086     }
1087     }
1088    
1089 nigel 81 /* Fill in the required variables, and pass back the pointer to the terminating
1090     '}'. */
1091 nigel 77
1092 nigel 81 *minp = min;
1093     *maxp = max;
1094 nigel 77 return p;
1095     }
1096    
1097    
1098    
1099     /*************************************************
1100 ph10 408 * Subroutine for finding forward reference *
1101 nigel 91 *************************************************/
1102    
1103 ph10 408 /* This recursive function is called only from find_parens() below. The
1104     top-level call starts at the beginning of the pattern. All other calls must
1105     start at a parenthesis. It scans along a pattern's text looking for capturing
1106 nigel 93 subpatterns, and counting them. If it finds a named pattern that matches the
1107     name it is given, it returns its number. Alternatively, if the name is NULL, it
1108 ph10 408 returns when it reaches a given numbered subpattern. We know that if (?P< is
1109     encountered, the name will be terminated by '>' because that is checked in the
1110 ph10 411 first pass. Recursion is used to keep track of subpatterns that reset the
1111 ph10 408 capturing group numbers - the (?| feature.
1112 nigel 91
1113     Arguments:
1114 ph10 408 ptrptr address of the current character pointer (updated)
1115 ph10 345 cd compile background data
1116 nigel 93 name name to seek, or NULL if seeking a numbered subpattern
1117     lorn name length, or subpattern number if name is NULL
1118     xmode TRUE if we are in /x mode
1119 ph10 556 utf8 TRUE if we are in UTF-8 mode
1120 ph10 411 count pointer to the current capturing subpattern number (updated)
1121 nigel 91
1122     Returns: the number of the named subpattern, or -1 if not found
1123     */
1124    
1125     static int
1126 ph10 408 find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn,
1127 ph10 556 BOOL xmode, BOOL utf8, int *count)
1128 nigel 91 {
1129 ph10 408 uschar *ptr = *ptrptr;
1130     int start_count = *count;
1131     int hwm_count = start_count;
1132     BOOL dup_parens = FALSE;
1133 nigel 93
1134 ph10 411 /* If the first character is a parenthesis, check on the type of group we are
1135 ph10 408 dealing with. The very first call may not start with a parenthesis. */
1136    
1137     if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1138     {
1139 ph10 544 /* Handle specials such as (*SKIP) or (*UTF8) etc. */
1140 ph10 545
1141 ph10 544 if (ptr[1] == CHAR_ASTERISK) ptr += 2;
1142 ph10 545
1143 ph10 544 /* Handle a normal, unnamed capturing parenthesis. */
1144 ph10 408
1145 ph10 544 else if (ptr[1] != CHAR_QUESTION_MARK)
1146 ph10 408 {
1147     *count += 1;
1148     if (name == NULL && *count == lorn) return *count;
1149 ph10 411 ptr++;
1150 ph10 408 }
1151    
1152 ph10 544 /* All cases now have (? at the start. Remember when we are in a group
1153     where the parenthesis numbers are duplicated. */
1154    
1155     else if (ptr[2] == CHAR_VERTICAL_LINE)
1156     {
1157     ptr += 3;
1158     dup_parens = TRUE;
1159     }
1160 ph10 545
1161 ph10 544 /* Handle comments; all characters are allowed until a ket is reached. */
1162    
1163     else if (ptr[2] == CHAR_NUMBER_SIGN)
1164     {
1165     for (ptr += 3; *ptr != 0; ptr++) if (*ptr == CHAR_RIGHT_PARENTHESIS) break;
1166     goto FAIL_EXIT;
1167 ph10 545 }
1168 ph10 544
1169 ph10 408 /* Handle a condition. If it is an assertion, just carry on so that it
1170     is processed as normal. If not, skip to the closing parenthesis of the
1171 ph10 544 condition (there can't be any nested parens). */
1172 ph10 411
1173 ph10 408 else if (ptr[2] == CHAR_LEFT_PARENTHESIS)
1174     {
1175 ph10 411 ptr += 2;
1176 ph10 408 if (ptr[1] != CHAR_QUESTION_MARK)
1177     {
1178     while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
1179 ph10 411 if (*ptr != 0) ptr++;
1180 ph10 408 }
1181 ph10 411 }
1182    
1183 ph10 544 /* Start with (? but not a condition. */
1184 ph10 408
1185     else
1186 ph10 411 {
1187 ph10 408 ptr += 2;
1188     if (*ptr == CHAR_P) ptr++; /* Allow optional P */
1189    
1190     /* We have to disambiguate (?<! and (?<= from (?<name> for named groups */
1191 ph10 411
1192 ph10 408 if ((*ptr == CHAR_LESS_THAN_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK &&
1193     ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)
1194     {
1195     int term;
1196     const uschar *thisname;
1197     *count += 1;
1198     if (name == NULL && *count == lorn) return *count;
1199     term = *ptr++;
1200     if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;
1201     thisname = ptr;
1202     while (*ptr != term) ptr++;
1203     if (name != NULL && lorn == ptr - thisname &&
1204     strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1205     return *count;
1206 ph10 461 term++;
1207 ph10 411 }
1208 ph10 408 }
1209 ph10 411 }
1210 ph10 408
1211 ph10 411 /* Past any initial parenthesis handling, scan for parentheses or vertical
1212 ph10 408 bars. */
1213    
1214 nigel 91 for (; *ptr != 0; ptr++)
1215     {
1216 nigel 93 /* Skip over backslashed characters and also entire \Q...\E */
1217    
1218 ph10 391 if (*ptr == CHAR_BACKSLASH)
1219 nigel 93 {
1220 ph10 408 if (*(++ptr) == 0) goto FAIL_EXIT;
1221 ph10 391 if (*ptr == CHAR_Q) for (;;)
1222 nigel 93 {
1223 ph10 391 while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1224 ph10 408 if (*ptr == 0) goto FAIL_EXIT;
1225 ph10 391 if (*(++ptr) == CHAR_E) break;
1226 nigel 93 }
1227     continue;
1228     }
1229    
1230 ph10 340 /* Skip over character classes; this logic must be similar to the way they
1231     are handled for real. If the first character is '^', skip it. Also, if the
1232     first few characters (either before or after ^) are \Q\E or \E we skip them
1233 ph10 392 too. This makes for compatibility with Perl. Note the use of STR macros to
1234 ph10 391 encode "Q\\E" so that it works in UTF-8 on EBCDIC platforms. */
1235 nigel 93
1236 ph10 391 if (*ptr == CHAR_LEFT_SQUARE_BRACKET)
1237 nigel 93 {
1238 ph10 340 BOOL negate_class = FALSE;
1239     for (;;)
1240     {
1241 ph10 438 if (ptr[1] == CHAR_BACKSLASH)
1242 ph10 340 {
1243 ph10 438 if (ptr[2] == CHAR_E)
1244     ptr+= 2;
1245     else if (strncmp((const char *)ptr+2,
1246 ph10 392 STR_Q STR_BACKSLASH STR_E, 3) == 0)
1247 ph10 438 ptr += 4;
1248 ph10 392 else
1249 ph10 391 break;
1250 ph10 340 }
1251 ph10 438 else if (!negate_class && ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1252 ph10 461 {
1253 ph10 340 negate_class = TRUE;
1254 ph10 438 ptr++;
1255 ph10 461 }
1256 ph10 340 else break;
1257     }
1258    
1259     /* If the next character is ']', it is a data character that must be
1260 ph10 341 skipped, except in JavaScript compatibility mode. */
1261 ph10 345
1262 ph10 392 if (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET &&
1263 ph10 391 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
1264 ph10 345 ptr++;
1265    
1266 ph10 391 while (*(++ptr) != CHAR_RIGHT_SQUARE_BRACKET)
1267 nigel 93 {
1268 ph10 220 if (*ptr == 0) return -1;
1269 ph10 391 if (*ptr == CHAR_BACKSLASH)
1270 nigel 93 {
1271 ph10 408 if (*(++ptr) == 0) goto FAIL_EXIT;
1272 ph10 391 if (*ptr == CHAR_Q) for (;;)
1273 nigel 93 {
1274 ph10 391 while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1275 ph10 408 if (*ptr == 0) goto FAIL_EXIT;
1276 ph10 391 if (*(++ptr) == CHAR_E) break;
1277 nigel 93 }
1278     continue;
1279     }
1280     }
1281     continue;
1282     }
1283    
1284     /* Skip comments in /x mode */
1285    
1286 ph10 391 if (xmode && *ptr == CHAR_NUMBER_SIGN)
1287 nigel 93 {
1288 ph10 556 ptr++;
1289     while (*ptr != 0)
1290     {
1291     if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
1292     ptr++;
1293     #ifdef SUPPORT_UTF8
1294     if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
1295     #endif
1296     }
1297 ph10 408 if (*ptr == 0) goto FAIL_EXIT;
1298 nigel 93 continue;
1299     }
1300    
1301 ph10 408 /* Check for the special metacharacters */
1302 ph10 411
1303 ph10 408 if (*ptr == CHAR_LEFT_PARENTHESIS)
1304 nigel 93 {
1305 ph10 556 int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, count);
1306 ph10 408 if (rc > 0) return rc;
1307     if (*ptr == 0) goto FAIL_EXIT;
1308 nigel 93 }
1309 ph10 411
1310 ph10 408 else if (*ptr == CHAR_RIGHT_PARENTHESIS)
1311     {
1312     if (dup_parens && *count < hwm_count) *count = hwm_count;
1313 ph10 545 goto FAIL_EXIT;
1314 ph10 408 }
1315 ph10 411
1316     else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)
1317 ph10 408 {
1318     if (*count > hwm_count) hwm_count = *count;
1319     *count = start_count;
1320 ph10 411 }
1321 ph10 408 }
1322 nigel 93
1323 ph10 408 FAIL_EXIT:
1324     *ptrptr = ptr;
1325     return -1;
1326     }
1327 nigel 93
1328    
1329    
1330    
1331 ph10 408 /*************************************************
1332     * Find forward referenced subpattern *
1333     *************************************************/
1334 nigel 93
1335 ph10 408 /* This function scans along a pattern's text looking for capturing
1336     subpatterns, and counting them. If it finds a named pattern that matches the
1337     name it is given, it returns its number. Alternatively, if the name is NULL, it
1338     returns when it reaches a given numbered subpattern. This is used for forward
1339     references to subpatterns. We used to be able to start this scan from the
1340     current compiling point, using the current count value from cd->bracount, and
1341     do it all in a single loop, but the addition of the possibility of duplicate
1342     subpattern numbers means that we have to scan from the very start, in order to
1343     take account of such duplicates, and to use a recursive function to keep track
1344     of the different types of group.
1345    
1346     Arguments:
1347     cd compile background data
1348     name name to seek, or NULL if seeking a numbered subpattern
1349     lorn name length, or subpattern number if name is NULL
1350     xmode TRUE if we are in /x mode
1351 ph10 556 utf8 TRUE if we are in UTF-8 mode
1352 ph10 408
1353     Returns: the number of the found subpattern, or -1 if not found
1354     */
1355    
1356     static int
1357 ph10 556 find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode,
1358     BOOL utf8)
1359 ph10 408 {
1360     uschar *ptr = (uschar *)cd->start_pattern;
1361     int count = 0;
1362     int rc;
1363    
1364     /* If the pattern does not start with an opening parenthesis, the first call
1365     to find_parens_sub() will scan right to the end (if necessary). However, if it
1366     does start with a parenthesis, find_parens_sub() will return when it hits the
1367     matching closing parens. That is why we have to have a loop. */
1368    
1369 ph10 411 for (;;)
1370     {
1371 ph10 556 rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, &count);
1372 ph10 411 if (rc > 0 || *ptr++ == 0) break;
1373     }
1374    
1375 ph10 408 return rc;
1376 nigel 91 }
1377    
1378    
1379    
1380 ph10 408
1381 nigel 91 /*************************************************
1382 nigel 77 * Find first significant op code *
1383     *************************************************/
1384    
1385     /* This is called by several functions that scan a compiled expression looking
1386     for a fixed first character, or an anchoring op code etc. It skips over things
1387     that do not influence this. For some calls, a change of option is important.
1388     For some calls, it makes sense to skip negative forward and all backward
1389     assertions, and also the \b assertion; for others it does not.
1390    
1391     Arguments:
1392     code pointer to the start of the group
1393     options pointer to external options
1394     optbit the option bit whose changing is significant, or
1395     zero if none are
1396     skipassert TRUE if certain assertions are to be skipped
1397    
1398     Returns: pointer to the first significant opcode
1399     */
1400    
1401     static const uschar*
1402     first_significant_code(const uschar *code, int *options, int optbit,
1403     BOOL skipassert)
1404     {
1405     for (;;)
1406     {
1407     switch ((int)*code)
1408     {
1409     case OP_OPT:
1410     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1411     *options = (int)code[1];
1412     code += 2;
1413     break;
1414    
1415     case OP_ASSERT_NOT:
1416     case OP_ASSERTBACK:
1417     case OP_ASSERTBACK_NOT:
1418     if (!skipassert) return code;
1419     do code += GET(code, 1); while (*code == OP_ALT);
1420     code += _pcre_OP_lengths[*code];
1421     break;
1422    
1423     case OP_WORD_BOUNDARY:
1424     case OP_NOT_WORD_BOUNDARY:
1425     if (!skipassert) return code;
1426     /* Fall through */
1427    
1428     case OP_CALLOUT:
1429     case OP_CREF:
1430 ph10 459 case OP_NCREF:
1431 nigel 93 case OP_RREF:
1432 ph10 459 case OP_NRREF:
1433 nigel 93 case OP_DEF:
1434 nigel 77 code += _pcre_OP_lengths[*code];
1435     break;
1436    
1437     default:
1438     return code;
1439     }
1440     }
1441     /* Control never reaches here */
1442     }
1443    
1444    
1445    
1446    
1447     /*************************************************
1448 ph10 454 * Find the fixed length of a branch *
1449 nigel 77 *************************************************/
1450    
1451 ph10 454 /* Scan a branch and compute the fixed length of subject that will match it,
1452 nigel 77 if the length is fixed. This is needed for dealing with backward assertions.
1453 ph10 461 In UTF8 mode, the result is in characters rather than bytes. The branch is
1454 ph10 454 temporarily terminated with OP_END when this function is called.
1455 nigel 77
1456 ph10 461 This function is called when a backward assertion is encountered, so that if it
1457     fails, the error message can point to the correct place in the pattern.
1458 ph10 454 However, we cannot do this when the assertion contains subroutine calls,
1459 ph10 461 because they can be forward references. We solve this by remembering this case
1460 ph10 454 and doing the check at the end; a flag specifies which mode we are running in.
1461    
1462 nigel 77 Arguments:
1463     code points to the start of the pattern (the bracket)
1464     options the compiling options
1465 ph10 461 atend TRUE if called when the pattern is complete
1466     cd the "compile data" structure
1467 nigel 77
1468 ph10 461 Returns: the fixed length,
1469 ph10 454 or -1 if there is no fixed length,
1470 nigel 77 or -2 if \C was encountered
1471 ph10 454 or -3 if an OP_RECURSE item was encountered and atend is FALSE
1472 nigel 77 */
1473    
1474     static int
1475 ph10 454 find_fixedlength(uschar *code, int options, BOOL atend, compile_data *cd)
1476 nigel 77 {
1477     int length = -1;
1478    
1479     register int branchlength = 0;
1480     register uschar *cc = code + 1 + LINK_SIZE;
1481    
1482     /* Scan along the opcodes for this branch. If we get to the end of the
1483     branch, check the length against that of the other branches. */
1484    
1485     for (;;)
1486     {
1487     int d;
1488 ph10 454 uschar *ce, *cs;
1489 nigel 77 register int op = *cc;
1490     switch (op)
1491     {
1492 nigel 93 case OP_CBRA:
1493 nigel 77 case OP_BRA:
1494     case OP_ONCE:
1495     case OP_COND:
1496 ph10 454 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options, atend, cd);
1497 nigel 77 if (d < 0) return d;
1498     branchlength += d;
1499     do cc += GET(cc, 1); while (*cc == OP_ALT);
1500     cc += 1 + LINK_SIZE;
1501     break;
1502    
1503     /* Reached end of a branch; if it's a ket it is the end of a nested
1504     call. If it's ALT it is an alternation in a nested call. If it is
1505     END it's the end of the outer call. All can be handled by the same code. */
1506    
1507     case OP_ALT:
1508     case OP_KET:
1509     case OP_KETRMAX:
1510     case OP_KETRMIN:
1511     case OP_END:
1512     if (length < 0) length = branchlength;
1513     else if (length != branchlength) return -1;
1514     if (*cc != OP_ALT) return length;
1515     cc += 1 + LINK_SIZE;
1516     branchlength = 0;
1517     break;
1518 ph10 461
1519 ph10 454 /* A true recursion implies not fixed length, but a subroutine call may
1520     be OK. If the subroutine is a forward reference, we can't deal with
1521     it until the end of the pattern, so return -3. */
1522 ph10 461
1523 ph10 454 case OP_RECURSE:
1524     if (!atend) return -3;
1525     cs = ce = (uschar *)cd->start_code + GET(cc, 1); /* Start subpattern */
1526     do ce += GET(ce, 1); while (*ce == OP_ALT); /* End subpattern */
1527     if (cc > cs && cc < ce) return -1; /* Recursion */
1528     d = find_fixedlength(cs + 2, options, atend, cd);
1529 ph10 461 if (d < 0) return d;
1530 ph10 454 branchlength += d;
1531     cc += 1 + LINK_SIZE;
1532 ph10 461 break;
1533 nigel 77
1534     /* Skip over assertive subpatterns */
1535    
1536     case OP_ASSERT:
1537     case OP_ASSERT_NOT:
1538     case OP_ASSERTBACK:
1539     case OP_ASSERTBACK_NOT:
1540     do cc += GET(cc, 1); while (*cc == OP_ALT);
1541     /* Fall through */
1542    
1543     /* Skip over things that don't match chars */
1544    
1545     case OP_REVERSE:
1546     case OP_CREF:
1547 ph10 459 case OP_NCREF:
1548 nigel 93 case OP_RREF:
1549 ph10 459 case OP_NRREF:
1550 nigel 93 case OP_DEF:
1551 nigel 77 case OP_OPT:
1552     case OP_CALLOUT:
1553     case OP_SOD:
1554     case OP_SOM:
1555 ph10 500 case OP_SET_SOM:
1556 nigel 77 case OP_EOD:
1557     case OP_EODN:
1558     case OP_CIRC:
1559     case OP_DOLL:
1560     case OP_NOT_WORD_BOUNDARY:
1561     case OP_WORD_BOUNDARY:
1562     cc += _pcre_OP_lengths[*cc];
1563     break;
1564    
1565     /* Handle literal characters */
1566    
1567     case OP_CHAR:
1568     case OP_CHARNC:
1569 nigel 91 case OP_NOT:
1570 nigel 77 branchlength++;
1571     cc += 2;
1572     #ifdef SUPPORT_UTF8
1573 ph10 461 if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)
1574 ph10 426 cc += _pcre_utf8_table4[cc[-1] & 0x3f];
1575 nigel 77 #endif
1576     break;
1577    
1578     /* Handle exact repetitions. The count is already in characters, but we
1579     need to skip over a multibyte character in UTF8 mode. */
1580    
1581     case OP_EXACT:
1582     branchlength += GET2(cc,1);
1583     cc += 4;
1584     #ifdef SUPPORT_UTF8
1585 ph10 461 if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)
1586 ph10 426 cc += _pcre_utf8_table4[cc[-1] & 0x3f];
1587 nigel 77 #endif
1588     break;
1589    
1590     case OP_TYPEEXACT:
1591     branchlength += GET2(cc,1);
1592 ph10 220 if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1593 nigel 77 cc += 4;
1594     break;
1595    
1596     /* Handle single-char matchers */
1597    
1598     case OP_PROP:
1599     case OP_NOTPROP:
1600 nigel 87 cc += 2;
1601 nigel 77 /* Fall through */
1602    
1603     case OP_NOT_DIGIT:
1604     case OP_DIGIT:
1605     case OP_NOT_WHITESPACE:
1606     case OP_WHITESPACE:
1607     case OP_NOT_WORDCHAR:
1608     case OP_WORDCHAR:
1609     case OP_ANY:
1610 ph10 342 case OP_ALLANY:
1611 nigel 77 branchlength++;
1612     cc++;
1613     break;
1614    
1615     /* The single-byte matcher isn't allowed */
1616    
1617     case OP_ANYBYTE:
1618     return -2;
1619    
1620     /* Check a class for variable quantification */
1621    
1622     #ifdef SUPPORT_UTF8
1623     case OP_XCLASS:
1624     cc += GET(cc, 1) - 33;
1625     /* Fall through */
1626     #endif
1627    
1628     case OP_CLASS:
1629     case OP_NCLASS:
1630     cc += 33;
1631    
1632     switch (*cc)
1633     {
1634     case OP_CRSTAR:
1635     case OP_CRMINSTAR:
1636     case OP_CRQUERY:
1637     case OP_CRMINQUERY:
1638     return -1;
1639    
1640     case OP_CRRANGE:
1641     case OP_CRMINRANGE:
1642     if (GET2(cc,1) != GET2(cc,3)) return -1;
1643     branchlength += GET2(cc,1);
1644     cc += 5;
1645     break;
1646    
1647     default:
1648     branchlength++;
1649     }
1650     break;
1651    
1652     /* Anything else is variable length */
1653    
1654     default:
1655     return -1;
1656     }
1657     }
1658     /* Control never gets here */
1659     }
1660    
1661    
1662    
1663    
1664     /*************************************************
1665 ph10 454 * Scan compiled regex for specific bracket *
1666 nigel 77 *************************************************/
1667    
1668     /* This little function scans through a compiled pattern until it finds a
1669 ph10 454 capturing bracket with the given number, or, if the number is negative, an
1670 ph10 461 instance of OP_REVERSE for a lookbehind. The function is global in the C sense
1671     so that it can be called from pcre_study() when finding the minimum matching
1672 ph10 455 length.
1673 nigel 77
1674     Arguments:
1675     code points to start of expression
1676     utf8 TRUE in UTF-8 mode
1677 ph10 454 number the required bracket number or negative to find a lookbehind
1678 nigel 77
1679     Returns: pointer to the opcode for the bracket, or NULL if not found
1680     */
1681    
1682 ph10 455 const uschar *
1683     _pcre_find_bracket(const uschar *code, BOOL utf8, int number)
1684 nigel 77 {
1685     for (;;)
1686     {
1687     register int c = *code;
1688     if (c == OP_END) return NULL;
1689 nigel 91
1690     /* XCLASS is used for classes that cannot be represented just by a bit
1691     map. This includes negated single high-valued characters. The length in
1692     the table is zero; the actual length is stored in the compiled code. */
1693    
1694     if (c == OP_XCLASS) code += GET(code, 1);
1695 ph10 461
1696 ph10 454 /* Handle recursion */
1697 ph10 461
1698 ph10 454 else if (c == OP_REVERSE)
1699     {
1700 ph10 461 if (number < 0) return (uschar *)code;
1701 ph10 454 code += _pcre_OP_lengths[c];
1702     }
1703 nigel 91
1704 nigel 93 /* Handle capturing bracket */
1705 nigel 91
1706 nigel 93 else if (c == OP_CBRA)
1707 nigel 77 {
1708 nigel 93 int n = GET2(code, 1+LINK_SIZE);
1709 nigel 77 if (n == number) return (uschar *)code;
1710 nigel 93 code += _pcre_OP_lengths[c];
1711 nigel 77 }
1712 nigel 91
1713 ph10 220 /* Otherwise, we can get the item's length from the table, except that for
1714     repeated character types, we have to test for \p and \P, which have an extra
1715 ph10 512 two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
1716 ph10 510 must add in its length. */
1717 nigel 91
1718 nigel 77 else
1719     {
1720 ph10 218 switch(c)
1721     {
1722     case OP_TYPESTAR:
1723     case OP_TYPEMINSTAR:
1724     case OP_TYPEPLUS:
1725     case OP_TYPEMINPLUS:
1726     case OP_TYPEQUERY:
1727     case OP_TYPEMINQUERY:
1728     case OP_TYPEPOSSTAR:
1729     case OP_TYPEPOSPLUS:
1730     case OP_TYPEPOSQUERY:
1731     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1732 ph10 220 break;
1733 ph10 221
1734     case OP_TYPEUPTO:
1735     case OP_TYPEMINUPTO:
1736     case OP_TYPEEXACT:
1737     case OP_TYPEPOSUPTO:
1738     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1739     break;
1740 ph10 512
1741 ph10 510 case OP_MARK:
1742     case OP_PRUNE_ARG:
1743     case OP_SKIP_ARG:
1744     code += code[1];
1745 ph10 512 break;
1746 ph10 550
1747     case OP_THEN_ARG:
1748     code += code[1+LINK_SIZE];
1749     break;
1750 ph10 220 }
1751    
1752 ph10 218 /* Add in the fixed length from the table */
1753 ph10 220
1754 nigel 77 code += _pcre_OP_lengths[c];
1755 ph10 220
1756 ph10 218 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1757     a multi-byte character. The length in the table is a minimum, so we have to
1758     arrange to skip the extra bytes. */
1759 ph10 220
1760 ph10 107 #ifdef SUPPORT_UTF8
1761 nigel 77 if (utf8) switch(c)
1762     {
1763     case OP_CHAR:
1764     case OP_CHARNC:
1765     case OP_EXACT:
1766     case OP_UPTO:
1767     case OP_MINUPTO:
1768 nigel 93 case OP_POSUPTO:
1769 nigel 77 case OP_STAR:
1770     case OP_MINSTAR:
1771 nigel 93 case OP_POSSTAR:
1772 nigel 77 case OP_PLUS:
1773     case OP_MINPLUS:
1774 nigel 93 case OP_POSPLUS:
1775 nigel 77 case OP_QUERY:
1776     case OP_MINQUERY:
1777 nigel 93 case OP_POSQUERY:
1778     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1779 nigel 77 break;
1780     }
1781 ph10 369 #else
1782     (void)(utf8); /* Keep compiler happy by referencing function argument */
1783 ph10 111 #endif
1784 nigel 77 }
1785     }
1786     }
1787    
1788    
1789    
1790     /*************************************************
1791     * Scan compiled regex for recursion reference *
1792     *************************************************/
1793    
1794     /* This little function scans through a compiled pattern until it finds an
1795     instance of OP_RECURSE.
1796    
1797     Arguments:
1798     code points to start of expression
1799     utf8 TRUE in UTF-8 mode
1800    
1801     Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1802     */
1803    
1804     static const uschar *
1805     find_recurse(const uschar *code, BOOL utf8)
1806     {
1807     for (;;)
1808     {
1809     register int c = *code;
1810     if (c == OP_END) return NULL;
1811 nigel 91 if (c == OP_RECURSE) return code;
1812 ph10 220
1813 nigel 91 /* XCLASS is used for classes that cannot be represented just by a bit
1814     map. This includes negated single high-valued characters. The length in
1815     the table is zero; the actual length is stored in the compiled code. */
1816    
1817     if (c == OP_XCLASS) code += GET(code, 1);
1818    
1819 ph10 220 /* Otherwise, we can get the item's length from the table, except that for
1820     repeated character types, we have to test for \p and \P, which have an extra
1821 ph10 512 two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
1822 ph10 510 must add in its length. */
1823 nigel 91
1824 nigel 77 else
1825     {
1826 ph10 218 switch(c)
1827     {
1828     case OP_TYPESTAR:
1829     case OP_TYPEMINSTAR:
1830     case OP_TYPEPLUS:
1831     case OP_TYPEMINPLUS:
1832     case OP_TYPEQUERY:
1833     case OP_TYPEMINQUERY:
1834     case OP_TYPEPOSSTAR:
1835     case OP_TYPEPOSPLUS:
1836     case OP_TYPEPOSQUERY:
1837     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1838 ph10 220 break;
1839 ph10 221
1840     case OP_TYPEPOSUPTO:
1841     case OP_TYPEUPTO:
1842     case OP_TYPEMINUPTO:
1843     case OP_TYPEEXACT:
1844     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1845     break;
1846 ph10 512
1847 ph10 510 case OP_MARK:
1848     case OP_PRUNE_ARG:
1849     case OP_SKIP_ARG:
1850     code += code[1];
1851 ph10 512 break;
1852 ph10 550
1853     case OP_THEN_ARG:
1854     code += code[1+LINK_SIZE];
1855     break;
1856 ph10 220 }
1857    
1858 ph10 218 /* Add in the fixed length from the table */
1859    
1860 nigel 77 code += _pcre_OP_lengths[c];
1861 ph10 220
1862 ph10 218 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1863     by a multi-byte character. The length in the table is a minimum, so we have
1864     to arrange to skip the extra bytes. */
1865 ph10 220
1866 ph10 107 #ifdef SUPPORT_UTF8
1867 nigel 77 if (utf8) switch(c)
1868     {
1869     case OP_CHAR:
1870     case OP_CHARNC:
1871     case OP_EXACT:
1872     case OP_UPTO:
1873     case OP_MINUPTO:
1874 nigel 93 case OP_POSUPTO:
1875 nigel 77 case OP_STAR:
1876     case OP_MINSTAR:
1877 nigel 93 case OP_POSSTAR:
1878 nigel 77 case OP_PLUS:
1879     case OP_MINPLUS:
1880 nigel 93 case OP_POSPLUS:
1881 nigel 77 case OP_QUERY:
1882     case OP_MINQUERY:
1883 nigel 93 case OP_POSQUERY:
1884     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1885 nigel 77 break;
1886     }
1887 ph10 369 #else
1888     (void)(utf8); /* Keep compiler happy by referencing function argument */
1889 ph10 111 #endif
1890 nigel 77 }
1891     }
1892     }
1893    
1894    
1895    
1896     /*************************************************
1897     * Scan compiled branch for non-emptiness *
1898     *************************************************/
1899    
1900     /* This function scans through a branch of a compiled pattern to see whether it
1901 nigel 93 can match the empty string or not. It is called from could_be_empty()
1902     below and from compile_branch() when checking for an unlimited repeat of a
1903     group that can match nothing. Note that first_significant_code() skips over
1904 ph10 282 backward and negative forward assertions when its final argument is TRUE. If we
1905     hit an unclosed bracket, we return "empty" - this means we've struck an inner
1906     bracket whose current branch will already have been scanned.
1907 nigel 77
1908     Arguments:
1909     code points to start of search
1910     endcode points to where to stop
1911     utf8 TRUE if in UTF8 mode
1912 ph10 503 cd contains pointers to tables etc.
1913 nigel 77
1914     Returns: TRUE if what is matched could be empty
1915     */
1916    
1917     static BOOL
1918 ph10 503 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8,
1919     compile_data *cd)
1920 nigel 77 {
1921     register int c;
1922 nigel 93 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1923 nigel 77 code < endcode;
1924     code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1925     {
1926     const uschar *ccode;
1927    
1928     c = *code;
1929 ph10 507
1930 ph10 286 /* Skip over forward assertions; the other assertions are skipped by
1931 ph10 282 first_significant_code() with a TRUE final argument. */
1932 ph10 286
1933 ph10 282 if (c == OP_ASSERT)
1934 ph10 286 {
1935 ph10 282 do code += GET(code, 1); while (*code == OP_ALT);
1936     c = *code;
1937     continue;
1938 ph10 286 }
1939 ph10 172
1940 ph10 170 /* Groups with zero repeats can of course be empty; skip them. */
1941 nigel 77
1942 ph10 335 if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO)
1943 ph10 170 {
1944 ph10 172 code += _pcre_OP_lengths[c];
1945 ph10 170 do code += GET(code, 1); while (*code == OP_ALT);
1946     c = *code;
1947     continue;
1948     }
1949 ph10 507
1950 ph10 503 /* For a recursion/subroutine call, if its end has been reached, which
1951     implies a subroutine call, we can scan it. */
1952 ph10 507
1953 ph10 503 if (c == OP_RECURSE)
1954     {
1955 ph10 507 BOOL empty_branch = FALSE;
1956 ph10 503 const uschar *scode = cd->start_code + GET(code, 1);
1957     if (GET(scode, 1) == 0) return TRUE; /* Unclosed */
1958     do
1959     {
1960 ph10 504 if (could_be_empty_branch(scode, endcode, utf8, cd))
1961     {
1962     empty_branch = TRUE;
1963 ph10 507 break;
1964     }
1965 ph10 503 scode += GET(scode, 1);
1966     }
1967     while (*scode == OP_ALT);
1968 ph10 504 if (!empty_branch) return FALSE; /* All branches are non-empty */
1969 ph10 503 continue;
1970 ph10 507 }
1971 ph10 170
1972     /* For other groups, scan the branches. */
1973 ph10 172
1974 ph10 206 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1975 nigel 77 {
1976     BOOL empty_branch;
1977     if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1978 ph10 406
1979     /* If a conditional group has only one branch, there is a second, implied,
1980 ph10 395 empty branch, so just skip over the conditional, because it could be empty.
1981     Otherwise, scan the individual branches of the group. */
1982 ph10 406
1983 ph10 395 if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
1984 nigel 77 code += GET(code, 1);
1985 ph10 395 else
1986 ph10 406 {
1987 ph10 395 empty_branch = FALSE;
1988     do
1989     {
1990 ph10 503 if (!empty_branch && could_be_empty_branch(code, endcode, utf8, cd))
1991 ph10 395 empty_branch = TRUE;
1992     code += GET(code, 1);
1993     }
1994     while (*code == OP_ALT);
1995     if (!empty_branch) return FALSE; /* All branches are non-empty */
1996 nigel 77 }
1997 ph10 406
1998 ph10 172 c = *code;
1999 nigel 93 continue;
2000 nigel 77 }
2001    
2002 nigel 93 /* Handle the other opcodes */
2003    
2004     switch (c)
2005 nigel 77 {
2006 ph10 216 /* Check for quantifiers after a class. XCLASS is used for classes that
2007     cannot be represented just by a bit map. This includes negated single
2008     high-valued characters. The length in _pcre_OP_lengths[] is zero; the
2009 ph10 220 actual length is stored in the compiled code, so we must update "code"
2010 ph10 216 here. */
2011 nigel 77
2012     #ifdef SUPPORT_UTF8
2013     case OP_XCLASS:
2014 ph10 216 ccode = code += GET(code, 1);
2015 nigel 77 goto CHECK_CLASS_REPEAT;
2016     #endif
2017    
2018     case OP_CLASS:
2019     case OP_NCLASS:
2020     ccode = code + 33;
2021    
2022     #ifdef SUPPORT_UTF8
2023     CHECK_CLASS_REPEAT:
2024     #endif
2025    
2026     switch (*ccode)
2027     {
2028     case OP_CRSTAR: /* These could be empty; continue */
2029     case OP_CRMINSTAR:
2030     case OP_CRQUERY:
2031     case OP_CRMINQUERY:
2032     break;
2033    
2034     default: /* Non-repeat => class must match */
2035     case OP_CRPLUS: /* These repeats aren't empty */
2036     case OP_CRMINPLUS:
2037     return FALSE;
2038    
2039     case OP_CRRANGE:
2040     case OP_CRMINRANGE:
2041     if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
2042     break;
2043     }
2044     break;
2045    
2046     /* Opcodes that must match a character */
2047    
2048     case OP_PROP:
2049     case OP_NOTPROP:
2050     case OP_EXTUNI:
2051     case OP_NOT_DIGIT:
2052     case OP_DIGIT:
2053     case OP_NOT_WHITESPACE:
2054     case OP_WHITESPACE:
2055     case OP_NOT_WORDCHAR:
2056     case OP_WORDCHAR:
2057     case OP_ANY:
2058 ph10 345 case OP_ALLANY:
2059 nigel 77 case OP_ANYBYTE:
2060     case OP_CHAR:
2061     case OP_CHARNC:
2062     case OP_NOT:
2063     case OP_PLUS:
2064     case OP_MINPLUS:
2065 nigel 93 case OP_POSPLUS:
2066 nigel 77 case OP_EXACT:
2067     case OP_NOTPLUS:
2068     case OP_NOTMINPLUS:
2069 nigel 93 case OP_NOTPOSPLUS:
2070 nigel 77 case OP_NOTEXACT:
2071     case OP_TYPEPLUS:
2072     case OP_TYPEMINPLUS:
2073 nigel 93 case OP_TYPEPOSPLUS:
2074 nigel 77 case OP_TYPEEXACT:
2075     return FALSE;
2076 ph10 227
2077     /* These are going to continue, as they may be empty, but we have to
2078     fudge the length for the \p and \P cases. */
2079    
2080 ph10 224 case OP_TYPESTAR:
2081     case OP_TYPEMINSTAR:
2082     case OP_TYPEPOSSTAR:
2083     case OP_TYPEQUERY:
2084     case OP_TYPEMINQUERY:
2085     case OP_TYPEPOSQUERY:
2086     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2087 ph10 227 break;
2088    
2089 ph10 224 /* Same for these */
2090 ph10 227
2091 ph10 224 case OP_TYPEUPTO:
2092     case OP_TYPEMINUPTO:
2093     case OP_TYPEPOSUPTO:
2094     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
2095     break;
2096 nigel 77
2097     /* End of branch */
2098    
2099     case OP_KET:
2100     case OP_KETRMAX:
2101     case OP_KETRMIN:
2102     case OP_ALT:
2103     return TRUE;
2104    
2105 nigel 93 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
2106     MINUPTO, and POSUPTO may be followed by a multibyte character */
2107 nigel 77
2108     #ifdef SUPPORT_UTF8
2109     case OP_STAR:
2110     case OP_MINSTAR:
2111 nigel 93 case OP_POSSTAR:
2112 nigel 77 case OP_QUERY:
2113     case OP_MINQUERY:
2114 nigel 93 case OP_POSQUERY:
2115 ph10 426 if (utf8 && code[1] >= 0xc0) code += _pcre_utf8_table4[code[1] & 0x3f];
2116     break;
2117 ph10 461
2118 nigel 77 case OP_UPTO:
2119     case OP_MINUPTO:
2120 nigel 93 case OP_POSUPTO:
2121 ph10 426 if (utf8 && code[3] >= 0xc0) code += _pcre_utf8_table4[code[3] & 0x3f];
2122 nigel 77 break;
2123     #endif
2124 ph10 503
2125 ph10 510 /* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument
2126     string. */
2127    
2128     case OP_MARK:
2129     case OP_PRUNE_ARG:
2130     case OP_SKIP_ARG:
2131     code += code[1];
2132 ph10 512 break;
2133 ph10 510
2134 ph10 550 case OP_THEN_ARG:
2135     code += code[1+LINK_SIZE];
2136     break;
2137    
2138 ph10 503 /* None of the remaining opcodes are required to match a character. */
2139 ph10 507
2140 ph10 503 default:
2141 ph10 507 break;
2142 nigel 77 }
2143     }
2144    
2145     return TRUE;
2146     }
2147    
2148    
2149    
2150     /*************************************************
2151     * Scan compiled regex for non-emptiness *
2152     *************************************************/
2153    
2154     /* This function is called to check for left recursive calls. We want to check
2155     the current branch of the current pattern to see if it could match the empty
2156     string. If it could, we must look outwards for branches at other levels,
2157     stopping when we pass beyond the bracket which is the subject of the recursion.
2158    
2159     Arguments:
2160     code points to start of the recursion
2161     endcode points to where to stop (current RECURSE item)
2162     bcptr points to the chain of current (unclosed) branch starts
2163     utf8 TRUE if in UTF-8 mode
2164 ph10 507 cd pointers to tables etc
2165 nigel 77
2166     Returns: TRUE if what is matched could be empty
2167     */
2168    
2169     static BOOL
2170     could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
2171 ph10 503 BOOL utf8, compile_data *cd)
2172 nigel 77 {
2173 ph10 475 while (bcptr != NULL && bcptr->current_branch >= code)
2174 nigel 77 {
2175 ph10 503 if (!could_be_empty_branch(bcptr->current_branch, endcode, utf8, cd))
2176 ph10 475 return FALSE;
2177 nigel 77 bcptr = bcptr->outer;
2178     }
2179     return TRUE;
2180     }
2181    
2182    
2183    
2184     /*************************************************
2185     * Check for POSIX class syntax *
2186     *************************************************/
2187    
2188     /* This function is called when the sequence "[:" or "[." or "[=" is
2189 ph10 295 encountered in a character class. It checks whether this is followed by a
2190 ph10 298 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
2191 ph10 295 reach an unescaped ']' without the special preceding character, return FALSE.
2192 nigel 77
2193 ph10 298 Originally, this function only recognized a sequence of letters between the
2194     terminators, but it seems that Perl recognizes any sequence of characters,
2195     though of course unknown POSIX names are subsequently rejected. Perl gives an
2196     "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
2197     didn't consider this to be a POSIX class. Likewise for [:1234:].
2198 ph10 295
2199 ph10 298 The problem in trying to be exactly like Perl is in the handling of escapes. We
2200     have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
2201     class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
2202     below handles the special case of \], but does not try to do any other escape
2203     processing. This makes it different from Perl for cases such as [:l\ower:]
2204 ph10 295 where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
2205 ph10 298 "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
2206 ph10 295 I think.
2207    
2208     Arguments:
2209 nigel 77 ptr pointer to the initial [
2210     endptr where to return the end pointer
2211    
2212     Returns: TRUE or FALSE
2213     */
2214    
2215     static BOOL
2216 ph10 295 check_posix_syntax(const uschar *ptr, const uschar **endptr)
2217 nigel 77 {
2218     int terminator; /* Don't combine these lines; the Solaris cc */
2219     terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
2220 ph10 295 for (++ptr; *ptr != 0; ptr++)
2221 nigel 77 {
2222 ph10 391 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) ptr++; else
2223 ph10 298 {
2224 ph10 391 if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2225     if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2226 ph10 295 {
2227     *endptr = ptr;
2228     return TRUE;
2229 ph10 298 }
2230     }
2231     }
2232 nigel 77 return FALSE;
2233     }
2234    
2235    
2236    
2237    
2238     /*************************************************
2239     * Check POSIX class name *
2240     *************************************************/
2241    
2242     /* This function is called to check the name given in a POSIX-style class entry
2243     such as [:alnum:].
2244    
2245     Arguments:
2246     ptr points to the first letter
2247     len the length of the name
2248    
2249     Returns: a value representing the name, or -1 if unknown
2250     */
2251    
2252     static int
2253     check_posix_name(const uschar *ptr, int len)
2254     {
2255 ph10 240 const char *pn = posix_names;
2256 nigel 77 register int yield = 0;
2257     while (posix_name_lengths[yield] != 0)
2258     {
2259     if (len == posix_name_lengths[yield] &&
2260 ph10 240 strncmp((const char *)ptr, pn, len) == 0) return yield;
2261 ph10 243 pn += posix_name_lengths[yield] + 1;
2262 nigel 77 yield++;
2263     }
2264     return -1;
2265     }
2266    
2267    
2268     /*************************************************
2269     * Adjust OP_RECURSE items in repeated group *
2270     *************************************************/
2271    
2272     /* OP_RECURSE items contain an offset from the start of the regex to the group
2273     that is referenced. This means that groups can be replicated for fixed
2274     repetition simply by copying (because the recursion is allowed to refer to
2275     earlier groups that are outside the current group). However, when a group is
2276 ph10 335 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
2277     inserted before it, after it has been compiled. This means that any OP_RECURSE
2278     items within it that refer to the group itself or any contained groups have to
2279     have their offsets adjusted. That is one of the jobs of this function. Before it
2280     is called, the partially compiled regex must be temporarily terminated with
2281     OP_END.
2282 nigel 77
2283 nigel 93 This function has been extended with the possibility of forward references for
2284     recursions and subroutine calls. It must also check the list of such references
2285     for the group we are dealing with. If it finds that one of the recursions in
2286     the current group is on this list, it adjusts the offset in the list, not the
2287     value in the reference (which is a group number).
2288    
2289 nigel 77 Arguments:
2290     group points to the start of the group
2291     adjust the amount by which the group is to be moved
2292     utf8 TRUE in UTF-8 mode
2293     cd contains pointers to tables etc.
2294 nigel 93 save_hwm the hwm forward reference pointer at the start of the group
2295 nigel 77
2296     Returns: nothing
2297     */
2298    
2299     static void
2300 nigel 93 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
2301     uschar *save_hwm)
2302 nigel 77 {
2303     uschar *ptr = group;
2304 ph10 224
2305 nigel 77 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
2306     {
2307 nigel 93 int offset;
2308     uschar *hc;
2309    
2310     /* See if this recursion is on the forward reference list. If so, adjust the
2311     reference. */
2312 ph10 345
2313 nigel 93 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
2314     {
2315     offset = GET(hc, 0);
2316     if (cd->start_code + offset == ptr + 1)
2317     {
2318     PUT(hc, 0, offset + adjust);
2319     break;
2320     }
2321     }
2322    
2323     /* Otherwise, adjust the recursion offset if it's after the start of this
2324     group. */
2325    
2326     if (hc >= cd->hwm)
2327     {
2328     offset = GET(ptr, 1);
2329     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
2330     }
2331    
2332 nigel 77 ptr += 1 + LINK_SIZE;
2333     }
2334     }
2335    
2336    
2337    
2338     /*************************************************
2339     * Insert an automatic callout point *
2340     *************************************************/
2341    
2342     /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
2343     callout points before each pattern item.
2344    
2345     Arguments:
2346     code current code pointer
2347     ptr current pattern pointer
2348     cd pointers to tables etc
2349    
2350     Returns: new code pointer
2351     */
2352    
2353     static uschar *
2354     auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
2355     {
2356     *code++ = OP_CALLOUT;
2357     *code++ = 255;
2358 ph10 530 PUT(code, 0, (int)(ptr - cd->start_pattern)); /* Pattern offset */
2359     PUT(code, LINK_SIZE, 0); /* Default length */
2360 nigel 77 return code + 2*LINK_SIZE;
2361     }
2362    
2363    
2364    
2365     /*************************************************
2366     * Complete a callout item *
2367     *************************************************/
2368    
2369     /* A callout item contains the length of the next item in the pattern, which
2370     we can't fill in till after we have reached the relevant point. This is used
2371     for both automatic and manual callouts.
2372    
2373     Arguments:
2374     previous_callout points to previous callout item
2375     ptr current pattern pointer
2376     cd pointers to tables etc
2377    
2378     Returns: nothing
2379     */
2380    
2381     static void
2382     complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
2383     {
2384 ph10 530 int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));
2385 nigel 77 PUT(previous_callout, 2 + LINK_SIZE, length);
2386     }
2387    
2388    
2389    
2390     #ifdef SUPPORT_UCP
2391     /*************************************************
2392     * Get othercase range *
2393     *************************************************/
2394    
2395     /* This function is passed the start and end of a class range, in UTF-8 mode
2396     with UCP support. It searches up the characters, looking for internal ranges of
2397     characters in the "other" case. Each call returns the next one, updating the
2398     start address.
2399    
2400     Arguments:
2401     cptr points to starting character value; updated
2402     d end value
2403     ocptr where to put start of othercase range
2404     odptr where to put end of othercase range
2405    
2406     Yield: TRUE when range returned; FALSE when no more
2407     */
2408    
2409     static BOOL
2410 nigel 93 get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
2411     unsigned int *odptr)
2412 nigel 77 {
2413 nigel 93 unsigned int c, othercase, next;
2414 nigel 77
2415     for (c = *cptr; c <= d; c++)
2416 ph10 349 { if ((othercase = UCD_OTHERCASE(c)) != c) break; }
2417 nigel 77
2418     if (c > d) return FALSE;
2419    
2420     *ocptr = othercase;
2421     next = othercase + 1;
2422    
2423     for (++c; c <= d; c++)
2424     {
2425 ph10 349 if (UCD_OTHERCASE(c) != next) break;
2426 nigel 77 next++;
2427     }
2428    
2429     *odptr = next - 1;
2430     *cptr = c;
2431    
2432     return TRUE;
2433     }
2434 ph10 532
2435    
2436    
2437     /*************************************************
2438     * Check a character and a property *
2439     *************************************************/
2440    
2441     /* This function is called by check_auto_possessive() when a property item
2442     is adjacent to a fixed character.
2443    
2444     Arguments:
2445     c the character
2446     ptype the property type
2447     pdata the data for the type
2448     negated TRUE if it's a negated property (\P or \p{^)
2449 ph10 535
2450 ph10 532 Returns: TRUE if auto-possessifying is OK
2451 ph10 535 */
2452 ph10 532
2453     static BOOL
2454     check_char_prop(int c, int ptype, int pdata, BOOL negated)
2455     {
2456     const ucd_record *prop = GET_UCD(c);
2457     switch(ptype)
2458     {
2459     case PT_LAMP:
2460     return (prop->chartype == ucp_Lu ||
2461     prop->chartype == ucp_Ll ||
2462     prop->chartype == ucp_Lt) == negated;
2463    
2464     case PT_GC:
2465     return (pdata == _pcre_ucp_gentype[prop->chartype]) == negated;
2466    
2467     case PT_PC:
2468     return (pdata == prop->chartype) == negated;
2469    
2470     case PT_SC:
2471     return (pdata == prop->script) == negated;
2472    
2473     /* These are specials */
2474    
2475     case PT_ALNUM:
2476     return (_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2477     _pcre_ucp_gentype[prop->chartype] == ucp_N) == negated;
2478    
2479     case PT_SPACE: /* Perl space */
2480     return (_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2481     c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2482     == negated;
2483    
2484     case PT_PXSPACE: /* POSIX space */
2485     return (_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2486     c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2487     c == CHAR_FF || c == CHAR_CR)
2488     == negated;
2489    
2490     case PT_WORD:
2491     return (_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2492     _pcre_ucp_gentype[prop->chartype] == ucp_N ||
2493     c == CHAR_UNDERSCORE) == negated;
2494     }
2495 ph10 535 return FALSE;
2496 ph10 532 }
2497 nigel 77 #endif /* SUPPORT_UCP */
2498    
2499    
2500 nigel 93
2501 nigel 77 /*************************************************
2502 nigel 93 * Check if auto-possessifying is possible *
2503     *************************************************/
2504    
2505     /* This function is called for unlimited repeats of certain items, to see
2506     whether the next thing could possibly match the repeated item. If not, it makes
2507     sense to automatically possessify the repeated item.
2508    
2509     Arguments:
2510 ph10 532 previous pointer to the repeated opcode
2511 nigel 93 utf8 TRUE in UTF-8 mode
2512     ptr next character in pattern
2513     options options bits
2514     cd contains pointers to tables etc.
2515    
2516     Returns: TRUE if possessifying is wanted
2517     */
2518    
2519     static BOOL
2520 ph10 535 check_auto_possessive(const uschar *previous, BOOL utf8, const uschar *ptr,
2521 ph10 532 int options, compile_data *cd)
2522 nigel 93 {
2523 ph10 532 int c, next;
2524     int op_code = *previous++;
2525 nigel 93
2526     /* Skip whitespace and comments in extended mode */
2527    
2528     if ((options & PCRE_EXTENDED) != 0)
2529     {
2530     for (;;)
2531     {
2532     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2533 ph10 391 if (*ptr == CHAR_NUMBER_SIGN)
2534 nigel 93 {
2535 ph10 556 ptr++;
2536     while (*ptr != 0)
2537     {
2538 nigel 93 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2539 ph10 556 ptr++;
2540     #ifdef SUPPORT_UTF8
2541     if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
2542     #endif
2543     }
2544 nigel 93 }
2545     else break;
2546     }
2547     }
2548    
2549     /* If the next item is one that we can handle, get its value. A non-negative
2550     value is a character, a negative value is an escape value. */
2551    
2552 ph10 391 if (*ptr == CHAR_BACKSLASH)
2553 nigel 93 {
2554     int temperrorcode = 0;
2555     next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2556     if (temperrorcode != 0) return FALSE;
2557     ptr++; /* Point after the escape sequence */
2558     }
2559    
2560     else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2561     {
2562     #ifdef SUPPORT_UTF8
2563     if (utf8) { GETCHARINC(next, ptr); } else
2564     #endif
2565     next = *ptr++;
2566     }
2567    
2568     else return FALSE;
2569    
2570     /* Skip whitespace and comments in extended mode */
2571    
2572     if ((options & PCRE_EXTENDED) != 0)
2573     {
2574     for (;;)
2575     {
2576     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2577 ph10 391 if (*ptr == CHAR_NUMBER_SIGN)
2578 nigel 93 {
2579 ph10 556 ptr++;
2580     while (*ptr != 0)
2581     {
2582 nigel 93 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2583 ph10 556 ptr++;
2584     #ifdef SUPPORT_UTF8
2585     if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
2586     #endif
2587     }
2588 nigel 93 }
2589     else break;
2590     }
2591     }
2592    
2593     /* If the next thing is itself optional, we have to give up. */
2594    
2595 ph10 392 if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
2596 ph10 391 strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
2597     return FALSE;
2598 nigel 93
2599 ph10 532 /* Now compare the next item with the previous opcode. First, handle cases when
2600     the next item is a character. */
2601 nigel 93
2602     if (next >= 0) switch(op_code)
2603     {
2604     case OP_CHAR:
2605 ph10 535 #ifdef SUPPORT_UTF8
2606 ph10 532 GETCHARTEST(c, previous);
2607 ph10 369 #else
2608 ph10 532 c = *previous;
2609 ph10 535 #endif
2610     return c != next;
2611 nigel 93
2612     /* For CHARNC (caseless character) we must check the other case. If we have
2613     Unicode property support, we can use it to test the other case of
2614     high-valued characters. */
2615    
2616     case OP_CHARNC:
2617 ph10 535 #ifdef SUPPORT_UTF8
2618 ph10 532 GETCHARTEST(c, previous);
2619     #else
2620     c = *previous;
2621 ph10 535 #endif
2622 ph10 532 if (c == next) return FALSE;
2623 nigel 93 #ifdef SUPPORT_UTF8
2624     if (utf8)
2625     {
2626     unsigned int othercase;
2627     if (next < 128) othercase = cd->fcc[next]; else
2628     #ifdef SUPPORT_UCP
2629 ph10 349 othercase = UCD_OTHERCASE((unsigned int)next);
2630 nigel 93 #else
2631     othercase = NOTACHAR;
2632     #endif
2633 ph10 532 return (unsigned int)c != othercase;
2634 nigel 93 }
2635     else
2636     #endif /* SUPPORT_UTF8 */
2637 ph10 532 return (c != cd->fcc[next]); /* Non-UTF-8 mode */
2638 nigel 93
2639 ph10 532 /* For OP_NOT, its data is always a single-byte character. */
2640 nigel 93
2641     case OP_NOT:
2642 ph10 532 if ((c = *previous) == next) return TRUE;
2643 nigel 93 if ((options & PCRE_CASELESS) == 0) return FALSE;
2644     #ifdef SUPPORT_UTF8
2645     if (utf8)
2646     {
2647     unsigned int othercase;
2648     if (next < 128) othercase = cd->fcc[next]; else
2649     #ifdef SUPPORT_UCP
2650 ph10 349 othercase = UCD_OTHERCASE(next);
2651 nigel 93 #else
2652     othercase = NOTACHAR;
2653     #endif
2654 ph10 532 return (unsigned int)c == othercase;
2655 nigel 93 }
2656     else
2657     #endif /* SUPPORT_UTF8 */
2658 ph10 532 return (c == cd->fcc[next]); /* Non-UTF-8 mode */
2659 nigel 93
2660 ph10 535 /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.
2661     When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
2662    
2663 nigel 93 case OP_DIGIT:
2664     return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2665    
2666     case OP_NOT_DIGIT:
2667     return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2668    
2669     case OP_WHITESPACE:
2670     return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2671    
2672     case OP_NOT_WHITESPACE:
2673     return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2674    
2675     case OP_WORDCHAR:
2676     return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2677    
2678     case OP_NOT_WORDCHAR:
2679     return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2680    
2681 ph10 180 case OP_HSPACE:
2682     case OP_NOT_HSPACE:
2683     switch(next)
2684     {
2685     case 0x09:
2686     case 0x20:
2687     case 0xa0:
2688     case 0x1680:
2689     case 0x180e:
2690     case 0x2000:
2691     case 0x2001:
2692     case 0x2002:
2693     case 0x2003:
2694     case 0x2004:
2695     case 0x2005:
2696     case 0x2006:
2697     case 0x2007:
2698     case 0x2008:
2699     case 0x2009:
2700     case 0x200A:
2701     case 0x202f:
2702     case 0x205f:
2703     case 0x3000:
2704 ph10 528 return op_code == OP_NOT_HSPACE;
2705 ph10 180 default:
2706 ph10 528 return op_code != OP_NOT_HSPACE;
2707 ph10 180 }
2708    
2709 ph10 528 case OP_ANYNL:
2710 ph10 180 case OP_VSPACE:
2711     case OP_NOT_VSPACE:
2712     switch(next)
2713     {
2714     case 0x0a:
2715     case 0x0b:
2716     case 0x0c:
2717     case 0x0d:
2718     case 0x85:
2719     case 0x2028:
2720     case 0x2029:
2721 ph10 528 return op_code == OP_NOT_VSPACE;
2722 ph10 180 default:
2723 ph10 528 return op_code != OP_NOT_VSPACE;
2724 ph10 180 }
2725    
2726 ph10 532 #ifdef SUPPORT_UCP
2727     case OP_PROP:
2728     return check_char_prop(next, previous[0], previous[1], FALSE);
2729 ph10 535
2730 ph10 532 case OP_NOTPROP:
2731     return check_char_prop(next, previous[0], previous[1], TRUE);
2732     #endif
2733    
2734 nigel 93 default:
2735     return FALSE;
2736     }
2737    
2738    
2739 ph10 535 /* Handle the case when the next item is \d, \s, etc. Note that when PCRE_UCP
2740     is set, \d turns into ESC_du rather than ESC_d, etc., so ESC_d etc. are
2741     generated only when PCRE_UCP is *not* set, that is, when only ASCII
2742     characteristics are recognized. Similarly, the opcodes OP_DIGIT etc. are
2743 ph10 532 replaced by OP_PROP codes when PCRE_UCP is set. */
2744 nigel 93
2745     switch(op_code)
2746     {
2747     case OP_CHAR:
2748     case OP_CHARNC:
2749 ph10 535 #ifdef SUPPORT_UTF8
2750 ph10 532 GETCHARTEST(c, previous);
2751     #else
2752     c = *previous;
2753 ph10 535 #endif
2754 nigel 93 switch(-next)
2755     {
2756     case ESC_d:
2757 ph10 532 return c > 127 || (cd->ctypes[c] & ctype_digit) == 0;
2758 nigel 93
2759     case ESC_D:
2760 ph10 532 return c <= 127 && (cd->ctypes[c] & ctype_digit) != 0;
2761 nigel 93
2762     case ESC_s:
2763 ph10 532 return c > 127 || (cd->ctypes[c] & ctype_space) == 0;
2764 nigel 93
2765     case ESC_S:
2766 ph10 532 return c <= 127 && (cd->ctypes[c] & ctype_space) != 0;
2767 nigel 93
2768     case ESC_w:
2769 ph10 532 return c > 127 || (cd->ctypes[c] & ctype_word) == 0;
2770 nigel 93
2771     case ESC_W:
2772 ph10 532 return c <= 127 && (cd->ctypes[c] & ctype_word) != 0;
2773 ph10 182
2774 ph10 180 case ESC_h:
2775     case ESC_H:
2776 ph10 532 switch(c)
2777 ph10 180 {
2778     case 0x09:
2779     case 0x20:
2780     case 0xa0:
2781     case 0x1680:
2782     case 0x180e:
2783     case 0x2000:
2784     case 0x2001:
2785     case 0x2002:
2786     case 0x2003:
2787     case 0x2004:
2788     case 0x2005:
2789     case 0x2006:
2790     case 0x2007:
2791     case 0x2008:
2792     case 0x2009:
2793     case 0x200A:
2794     case 0x202f:
2795     case 0x205f:
2796     case 0x3000:
2797     return -next != ESC_h;
2798     default:
2799     return -next == ESC_h;
2800 ph10 182 }
2801    
2802 ph10 180 case ESC_v:
2803     case ESC_V:
2804 ph10 532 switch(c)
2805 ph10 180 {
2806     case 0x0a:
2807     case 0x0b:
2808     case 0x0c:
2809     case 0x0d:
2810     case 0x85:
2811     case 0x2028:
2812     case 0x2029:
2813     return -next != ESC_v;
2814     default:
2815     return -next == ESC_v;
2816 ph10 182 }
2817 ph10 535
2818     /* When PCRE_UCP is set, these values get generated for \d etc. Find
2819     their substitutions and process them. The result will always be either
2820 ph10 532 -ESC_p or -ESC_P. Then fall through to process those values. */
2821 ph10 535
2822 ph10 532 #ifdef SUPPORT_UCP
2823     case ESC_du:
2824     case ESC_DU:
2825     case ESC_wu:
2826     case ESC_WU:
2827     case ESC_su:
2828     case ESC_SU:
2829     {
2830     int temperrorcode = 0;
2831     ptr = substitutes[-next - ESC_DU];
2832     next = check_escape(&ptr, &temperrorcode, 0, options, FALSE);
2833     if (temperrorcode != 0) return FALSE;
2834     ptr++; /* For compatibility */
2835     }
2836 ph10 535 /* Fall through */
2837 nigel 93
2838 ph10 532 case ESC_p:
2839     case ESC_P:
2840     {
2841     int ptype, pdata, errorcodeptr;
2842 ph10 535 BOOL negated;
2843    
2844 ph10 532 ptr--; /* Make ptr point at the p or P */
2845     ptype = get_ucp(&ptr, &negated, &pdata, &errorcodeptr);
2846     if (ptype < 0) return FALSE;
2847     ptr++; /* Point past the final curly ket */
2848 ph10 535
2849 ph10 532 /* If the property item is optional, we have to give up. (When generated
2850     from \d etc by PCRE_UCP, this test will have been applied much earlier,
2851     to the original \d etc. At this point, ptr will point to a zero byte. */
2852 ph10 535
2853 ph10 532 if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
2854     strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
2855     return FALSE;
2856 ph10 535
2857 ph10 532 /* Do the property check. */
2858 ph10 535
2859 ph10 532 return check_char_prop(c, ptype, pdata, (next == -ESC_P) != negated);
2860 ph10 535 }
2861 ph10 532 #endif
2862    
2863 nigel 93 default:
2864     return FALSE;
2865     }
2866    
2867 ph10 535 /* In principle, support for Unicode properties should be integrated here as
2868     well. It means re-organizing the above code so as to get hold of the property
2869     values before switching on the op-code. However, I wonder how many patterns
2870     combine ASCII \d etc with Unicode properties? (Note that if PCRE_UCP is set,
2871     these op-codes are never generated.) */
2872    
2873 nigel 93 case OP_DIGIT:
2874 ph10 180 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2875 ph10 528 next == -ESC_h || next == -ESC_v || next == -ESC_R;
2876 nigel 93
2877     case OP_NOT_DIGIT:
2878     return next == -ESC_d;
2879    
2880     case OP_WHITESPACE:
2881 ph10 528 return next == -ESC_S || next == -ESC_d || next == -ESC_w || next == -ESC_R;
2882 nigel 93
2883     case OP_NOT_WHITESPACE:
2884 ph10 180 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2885 nigel 93
2886 ph10 180 case OP_HSPACE:
2887 ph10 535 return next == -ESC_S || next == -ESC_H || next == -ESC_d ||
2888 ph10 528 next == -ESC_w || next == -ESC_v || next == -ESC_R;
2889 ph10 180
2890     case OP_NOT_HSPACE:
2891     return next == -ESC_h;
2892 ph10 182
2893 ph10 180 /* Can't have \S in here because VT matches \S (Perl anomaly) */
2894 ph10 535 case OP_ANYNL:
2895 ph10 182 case OP_VSPACE:
2896 ph10 180 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2897    
2898     case OP_NOT_VSPACE:
2899 ph10 528 return next == -ESC_v || next == -ESC_R;
2900 ph10 180
2901 nigel 93 case OP_WORDCHAR:
2902 ph10 535 return next == -ESC_W || next == -ESC_s || next == -ESC_h ||
2903 ph10 528 next == -ESC_v || next == -ESC_R;
2904 nigel 93
2905     case OP_NOT_WORDCHAR:
2906     return next == -ESC_w || next == -ESC_d;
2907 ph10 182
2908 nigel 93 default:
2909     return FALSE;
2910     }
2911    
2912     /* Control does not reach here */
2913     }
2914    
2915    
2916    
2917     /*************************************************
2918 nigel 77 * Compile one branch *
2919     *************************************************/
2920    
2921 nigel 93 /* Scan the pattern, compiling it into a vector. If the options are
2922 nigel 77 changed during the branch, the pointer is used to change the external options
2923 nigel 93 bits. This function is used during the pre-compile phase when we are trying
2924     to find out the amount of memory needed, as well as during the real compile
2925     phase. The value of lengthptr distinguishes the two phases.
2926 nigel 77
2927     Arguments:
2928     optionsptr pointer to the option bits
2929     codeptr points to the pointer to the current code point
2930     ptrptr points to the current pattern pointer
2931     errorcodeptr points to error code variable
2932     firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2933     reqbyteptr set to the last literal character required, else < 0
2934     bcptr points to current branch chain
2935     cd contains pointers to tables etc.
2936 nigel 93 lengthptr NULL during the real compile phase
2937     points to length accumulator during pre-compile phase
2938 nigel 77
2939     Returns: TRUE on success
2940     FALSE, with *errorcodeptr set non-zero on error
2941     */
2942    
2943     static BOOL
2944 nigel 93 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2945     int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2946     compile_data *cd, int *lengthptr)
2947 nigel 77 {
2948     int repeat_type, op_type;
2949     int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2950     int bravalue = 0;
2951     int greedy_default, greedy_non_default;
2952     int firstbyte, reqbyte;
2953     int zeroreqbyte, zerofirstbyte;
2954     int req_caseopt, reqvary, tempreqvary;
2955     int options = *optionsptr;
2956     int after_manual_callout = 0;
2957 nigel 93 int length_prevgroup = 0;
2958 nigel 77 register int c;
2959     register uschar *code = *codeptr;
2960 nigel 93 uschar *last_code = code;
2961     uschar *orig_code = code;
2962 nigel 77 uschar *tempcode;
2963     BOOL inescq = FALSE;
2964     BOOL groupsetfirstbyte = FALSE;
2965     const uschar *ptr = *ptrptr;
2966     const uschar *tempptr;
2967 ph10 518 const uschar *nestptr = NULL;
2968 nigel 77 uschar *previous = NULL;
2969     uschar *previous_callout = NULL;
2970 nigel 93 uschar *save_hwm = NULL;
2971 nigel 77 uschar classbits[32];
2972    
2973     #ifdef SUPPORT_UTF8
2974     BOOL class_utf8;
2975     BOOL utf8 = (options & PCRE_UTF8) != 0;
2976     uschar *class_utf8data;
2977 ph10 300 uschar *class_utf8data_base;
2978 nigel 77 uschar utf8_char[6];
2979     #else
2980     BOOL utf8 = FALSE;
2981 nigel 93 uschar *utf8_char = NULL;
2982 nigel 77 #endif
2983    
2984 ph10 475 #ifdef PCRE_DEBUG
2985 nigel 93 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2986     #endif
2987    
2988 nigel 77 /* Set up the default and non-default settings for greediness */
2989    
2990     greedy_default = ((options & PCRE_UNGREEDY) != 0);
2991     greedy_non_default = greedy_default ^ 1;
2992    
2993     /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2994     matching encountered yet". It gets changed to REQ_NONE if we hit something that
2995     matches a non-fixed char first char; reqbyte just remains unset if we never
2996     find one.
2997    
2998     When we hit a repeat whose minimum is zero, we may have to adjust these values
2999     to take the zero repeat into account. This is implemented by setting them to
3000     zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
3001     item types that can be repeated set these backoff variables appropriately. */
3002    
3003     firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
3004    
3005     /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
3006     according to the current setting of the caseless flag. REQ_CASELESS is a bit
3007     value > 255. It is added into the firstbyte or reqbyte variables to record the
3008     case status of the value. This is used only for ASCII characters. */
3009    
3010     req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
3011    
3012     /* Switch on next character until the end of the branch */
3013    
3014     for (;; ptr++)
3015     {
3016     BOOL negate_class;
3017 ph10 286 BOOL should_flip_negation;
3018 nigel 77 BOOL possessive_quantifier;
3019     BOOL is_quantifier;
3020 nigel 93 BOOL is_recurse;
3021 ph10 180 BOOL reset_bracount;
3022 nigel 77 int class_charcount;
3023     int class_lastchar;
3024     int newoptions;
3025     int recno;
3026 ph10 172 int refsign;
3027 nigel 77 int skipbytes;
3028     int subreqbyte;
3029     int subfirstbyte;
3030 nigel 93 int terminator;
3031 nigel 77 int mclength;
3032     uschar mcbuffer[8];
3033    
3034 nigel 93 /* Get next byte in the pattern */
3035 nigel 77
3036     c = *ptr;
3037 ph10 345
3038 ph10 535 /* If we are at the end of a nested substitution, revert to the outer level
3039 ph10 518 string. Nesting only happens one level deep. */
3040    
3041     if (c == 0 && nestptr != NULL)
3042     {
3043     ptr = nestptr;
3044     nestptr = NULL;
3045     c = *ptr;
3046     }
3047    
3048 nigel 93 /* If we are in the pre-compile phase, accumulate the length used for the
3049     previous cycle of this loop. */
3050    
3051     if (lengthptr != NULL)
3052     {
3053 ph10 475 #ifdef PCRE_DEBUG
3054 nigel 93 if (code > cd->hwm) cd->hwm = code; /* High water info */
3055     #endif
3056 ph10 505 if (code > cd->start_workspace + WORK_SIZE_CHECK) /* Check for overrun */
3057 nigel 93 {
3058     *errorcodeptr = ERR52;
3059     goto FAILED;
3060     }
3061    
3062     /* There is at least one situation where code goes backwards: this is the
3063     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
3064     the class is simply eliminated. However, it is created first, so we have to
3065     allow memory for it. Therefore, don't ever reduce the length at this point.
3066     */
3067    
3068     if (code < last_code) code = last_code;
3069 ph10 202
3070     /* Paranoid check for integer overflow */
3071    
3072     if (OFLOW_MAX - *lengthptr < code - last_code)
3073     {
3074     *errorcodeptr = ERR20;
3075     goto FAILED;
3076     }
3077    
3078 ph10 530 *lengthptr += (int)(code - last_code);
3079 nigel 93 DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
3080    
3081     /* If "previous" is set and it is not at the start of the work space, move
3082     it back to there, in order to avoid filling up the work space. Otherwise,
3083     if "previous" is NULL, reset the current code pointer to the start. */
3084    
3085     if (previous != NULL)
3086     {
3087     if (previous > orig_code)
3088     {
3089     memmove(orig_code, previous, code - previous);
3090     code -= previous - orig_code;
3091     previous = orig_code;
3092     }
3093     }
3094     else code = orig_code;
3095    
3096     /* Remember where this code item starts so we can pick up the length
3097     next time round. */
3098    
3099     last_code = code;
3100     }
3101    
3102     /* In the real compile phase, just check the workspace used by the forward
3103     reference list. */
3104    
3105 ph10 505 else if (cd->hwm > cd->start_workspace + WORK_SIZE_CHECK)
3106 nigel 93 {
3107     *errorcodeptr = ERR52;
3108     goto FAILED;
3109     }
3110    
3111 nigel 77 /* If in \Q...\E, check for the end; if not, we have a literal */
3112    
3113     if (inescq && c != 0)
3114     {
3115 ph10 391 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
3116 nigel 77 {
3117     inescq = FALSE;
3118     ptr++;
3119     continue;
3120     }
3121     else
3122     {
3123     if (previous_callout != NULL)
3124     {
3125 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
3126     complete_callout(previous_callout, ptr, cd);
3127 nigel 77 previous_callout = NULL;
3128     }
3129     if ((options & PCRE_AUTO_CALLOUT) != 0)
3130     {
3131     previous_callout = code;
3132     code = auto_callout(code, ptr, cd);
3133     }
3134     goto NORMAL_CHAR;
3135     }
3136     }
3137    
3138     /* Fill in length of a previous callout, except when the next thing is
3139     a quantifier. */
3140    
3141 ph10 392 is_quantifier =
3142 ph10 391 c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
3143     (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
3144 nigel 77
3145     if (!is_quantifier && previous_callout != NULL &&
3146     after_manual_callout-- <= 0)
3147     {
3148 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
3149     complete_callout(previous_callout, ptr, cd);
3150 nigel 77 previous_callout = NULL;
3151     }
3152    
3153     /* In extended mode, skip white space and comments */
3154    
3155     if ((options & PCRE_EXTENDED) != 0)
3156     {
3157     if ((cd->ctypes[c] & ctype_space) != 0) continue;
3158 ph10 391 if (c == CHAR_NUMBER_SIGN)
3159 nigel 77 {
3160 ph10 556 ptr++;
3161     while (*ptr != 0)
3162 nigel 91 {
3163 nigel 93 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
3164 ph10 556 ptr++;
3165     #ifdef SUPPORT_UTF8
3166     if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
3167     #endif
3168 nigel 91 }
3169 nigel 93 if (*ptr != 0) continue;
3170    
3171 nigel 91 /* Else fall through to handle end of string */
3172     c = 0;
3173 nigel 77 }
3174     }
3175    
3176     /* No auto callout for quantifiers. */
3177    
3178     if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
3179     {
3180     previous_callout = code;
3181     code = auto_callout(code, ptr, cd);
3182     }
3183    
3184     switch(c)
3185     {
3186 nigel 93 /* ===================================================================*/
3187     case 0: /* The branch terminates at string end */
3188 ph10 391 case CHAR_VERTICAL_LINE: /* or | or ) */
3189     case CHAR_RIGHT_PARENTHESIS:
3190 nigel 77 *firstbyteptr = firstbyte;
3191     *reqbyteptr = reqbyte;
3192     *codeptr = code;
3193     *ptrptr = ptr;
3194 nigel 93 if (lengthptr != NULL)
3195     {
3196 ph10 202 if (OFLOW_MAX - *lengthptr < code - last_code)
3197     {
3198     *errorcodeptr = ERR20;
3199     goto FAILED;
3200     }
3201 ph10 530 *lengthptr += (int)(code - last_code); /* To include callout length */
3202 nigel 93 DPRINTF((">> end branch\n"));
3203     }
3204 nigel 77 return TRUE;
3205    
3206 nigel 93
3207     /* ===================================================================*/
3208 nigel 77 /* Handle single-character metacharacters. In multiline mode, ^ disables
3209     the setting of any following char as a first character. */
3210    
3211 ph10 391 case CHAR_CIRCUMFLEX_ACCENT:
3212 nigel 77 if ((options & PCRE_MULTILINE) != 0)
3213     {
3214     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3215     }
3216     previous = NULL;
3217     *code++ = OP_CIRC;
3218     break;
3219    
3220 ph10 391 case CHAR_DOLLAR_SIGN:
3221 nigel 77 previous = NULL;
3222     *code++ = OP_DOLL;
3223     break;
3224    
3225     /* There can never be a first char if '.' is first, whatever happens about
3226     repeats. The value of reqbyte doesn't change either. */
3227    
3228 ph10 391 case CHAR_DOT:
3229 nigel 77 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3230     zerofirstbyte = firstbyte;
3231     zeroreqbyte = reqbyte;
3232     previous = code;
3233 ph10 342 *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
3234 nigel 77 break;
3235    
3236 nigel 93
3237     /* ===================================================================*/
3238 nigel 87 /* Character classes. If the included characters are all < 256, we build a
3239     32-byte bitmap of the permitted characters, except in the special case
3240     where there is only one such character. For negated classes, we build the
3241     map as usual, then invert it at the end. However, we use a different opcode
3242     so that data characters > 255 can be handled correctly.
3243 nigel 77
3244     If the class contains characters outside the 0-255 range, a different
3245     opcode is compiled. It may optionally have a bit map for characters < 256,
3246     but those above are explicitly listed afterwards. A flag byte tells
3247     whether the bitmap is present, and whether this is a negated class or not.
3248 ph10 345
3249 ph10 336 In JavaScript compatibility mode, an isolated ']' causes an error. In
3250     default (Perl) mode, it is treated as a data character. */
3251 ph10 345
3252 ph10 391 case CHAR_RIGHT_SQUARE_BRACKET:
3253 ph10 336 if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
3254     {
3255     *errorcodeptr = ERR64;
3256 ph10 345 goto FAILED;
3257 ph10 336 }
3258 ph10 345 goto NORMAL_CHAR;
3259 nigel 77
3260 ph10 391 case CHAR_LEFT_SQUARE_BRACKET:
3261 nigel 77 previous = code;
3262    
3263     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
3264     they are encountered at the top level, so we'll do that too. */
3265    
3266 ph10 392 if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3267 ph10 391 ptr[1] == CHAR_EQUALS_SIGN) &&
3268 ph10 295 check_posix_syntax(ptr, &tempptr))
3269 nigel 77 {
3270 ph10 391 *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
3271 nigel 77 goto FAILED;
3272     }
3273    
3274 ph10 205 /* If the first character is '^', set the negation flag and skip it. Also,
3275 ph10 208 if the first few characters (either before or after ^) are \Q\E or \E we
3276 ph10 205 skip them too. This makes for compatibility with Perl. */
3277 ph10 208
3278 ph10 205 negate_class = FALSE;
3279     for (;;)
3280 nigel 77 {
3281     c = *(++ptr);
3282 ph10 391 if (c == CHAR_BACKSLASH)
3283 ph10 205 {
3284 ph10 392 if (ptr[1] == CHAR_E)
3285 ph10 391 ptr++;
3286 ph10 392 else if (strncmp((const char *)ptr+1,
3287     STR_Q STR_BACKSLASH STR_E, 3) == 0)
3288 ph10 391 ptr += 3;
3289 ph10 392 else
3290 ph10 391 break;
3291 ph10 205 }
3292 ph10 391 else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
3293 ph10 205 negate_class = TRUE;
3294     else break;
3295 ph10 208 }
3296 ph10 345
3297     /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
3298     an initial ']' is taken as a data character -- the code below handles
3299 ph10 341 that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
3300     [^] must match any character, so generate OP_ALLANY. */
3301 ph10 345
3302 ph10 392 if (c == CHAR_RIGHT_SQUARE_BRACKET &&
3303 ph10 391 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
3304 ph10 341 {
3305     *code++ = negate_class? OP_ALLANY : OP_FAIL;
3306     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3307     zerofirstbyte = firstbyte;
3308     break;
3309 ph10 345 }
3310 nigel 77
3311 ph10 286 /* If a class contains a negative special such as \S, we need to flip the
3312     negation flag at the end, so that support for characters > 255 works
3313 ph10 264 correctly (they are all included in the class). */
3314    
3315     should_flip_negation = FALSE;
3316    
3317 nigel 77 /* Keep a count of chars with values < 256 so that we can optimize the case
3318 nigel 93 of just a single character (as long as it's < 256). However, For higher
3319     valued UTF-8 characters, we don't yet do any optimization. */
3320 nigel 77
3321     class_charcount = 0;
3322     class_lastchar = -1;
3323    
3324 nigel 93 /* Initialize the 32-char bit map to all zeros. We build the map in a
3325     temporary bit of memory, in case the class contains only 1 character (less
3326     than 256), because in that case the compiled code doesn't use the bit map.
3327     */
3328    
3329     memset(classbits, 0, 32 * sizeof(uschar));
3330    
3331 nigel 77 #ifdef SUPPORT_UTF8
3332     class_utf8 = FALSE; /* No chars >= 256 */
3333 nigel 93 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
3334 ph10 309 class_utf8data_base = class_utf8data; /* For resetting in pass 1 */
3335 nigel 77 #endif
3336    
3337     /* Process characters until ] is reached. By writing this as a "do" it
3338 nigel 93 means that an initial ] is taken as a data character. At the start of the
3339     loop, c contains the first byte of the character. */
3340 nigel 77
3341 nigel 93 if (c != 0) do
3342 nigel 77 {
3343 nigel 93 const uschar *oldptr;
3344    
3345 nigel 77 #ifdef SUPPORT_UTF8
3346     if (utf8 && c > 127)
3347     { /* Braces are required because the */
3348     GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
3349     }
3350 ph10 535
3351 ph10 300 /* In the pre-compile phase, accumulate the length of any UTF-8 extra
3352 ph10 309 data and reset the pointer. This is so that very large classes that
3353 ph10 300 contain a zillion UTF-8 characters no longer overwrite the work space
3354 ph10 309 (which is on the stack). */
3355    
3356 ph10 300 if (lengthptr != NULL)
3357     {
3358     *lengthptr += class_utf8data - class_utf8data_base;
3359 ph10 309 class_utf8data = class_utf8data_base;
3360     }
3361    
3362 nigel 77 #endif
3363    
3364     /* Inside \Q...\E everything is literal except \E */
3365    
3366     if (inescq)
3367     {
3368 ph10 391 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E) /* If we are at \E */
3369 nigel 77 {
3370 nigel 93 inescq = FALSE; /* Reset literal state */
3371     ptr++; /* Skip the 'E' */
3372     continue; /* Carry on with next */
3373 nigel 77 }
3374 nigel 93 goto CHECK_RANGE; /* Could be range if \E follows */
3375 nigel 77 }
3376    
3377     /* Handle POSIX class names. Perl allows a negation extension of the
3378     form [:^name:]. A square bracket that doesn't match the syntax is
3379     treated as a literal. We also recognize the POSIX constructions
3380     [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
3381     5.6 and 5.8 do. */
3382    
3383 ph10 391 if (c == CHAR_LEFT_SQUARE_BRACKET &&
3384 ph10 392 (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3385 ph10 391 ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
3386 nigel 77 {
3387     BOOL local_negate = FALSE;
3388 nigel 87 int posix_class, taboffset, tabopt;
3389 nigel 77 register const uschar *cbits = cd->cbits;
3390 nigel 87 uschar pbits[32];
3391 nigel 77
3392 ph10 391 if (ptr[1] != CHAR_COLON)
3393 nigel 77 {
3394     *errorcodeptr = ERR31;
3395     goto FAILED;
3396     }
3397    
3398     ptr += 2;
3399 ph10 391 if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
3400 nigel 77 {
3401     local_negate = TRUE;
3402 ph10 286 should_flip_negation = TRUE; /* Note negative special */
3403 nigel 77 ptr++;
3404     }
3405    
3406 ph10 530 posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
3407 nigel 77 if (posix_class < 0)
3408     {
3409     *errorcodeptr = ERR30;
3410     goto FAILED;
3411     }
3412    
3413     /* If matching is caseless, upper and lower are converted to
3414     alpha. This relies on the fact that the class table starts with
3415     alpha, lower, upper as the first 3 entries. */
3416    
3417     if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
3418     posix_class = 0;
3419 ph10 535
3420     /* When PCRE_UCP is set, some of the POSIX classes are converted to
3421 ph10 518 different escape sequences that use Unicode properties. */
3422 ph10 535
3423 ph10 518 #ifdef SUPPORT_UCP
3424     if ((options & PCRE_UCP) != 0)
3425     {
3426     int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);
3427     if (posix_substitutes[pc] != NULL)
3428     {
3429 ph10 535 nestptr = tempptr + 1;
3430 ph10 518 ptr = posix_substitutes[pc] - 1;
3431 ph10 535 continue;
3432     }
3433     }
3434     #endif
3435 ph10 518 /* In the non-UCP case, we build the bit map for the POSIX class in a
3436     chunk of local store because we may be adding and subtracting from it,
3437     and we don't want to subtract bits that may be in the main map already.
3438     At the end we or the result into the bit map that is being built. */
3439 nigel 77
3440     posix_class *= 3;
3441 nigel 87
3442     /* Copy in the first table (always present) */
3443    
3444     memcpy(pbits, cbits + posix_class_maps[posix_class],
3445     32 * sizeof(uschar));
3446    
3447     /* If there is a second table, add or remove it as required. */
3448    
3449     taboffset = posix_class_maps[posix_class + 1];
3450     tabopt = posix_class_maps[posix_class + 2];
3451    
3452     if (taboffset >= 0)
3453 nigel 77 {
3454 nigel 87 if (tabopt >= 0)
3455     for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
3456 nigel 77 else
3457 nigel 87 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
3458 nigel 77 }
3459    
3460 nigel 87 /* Now see if we need to remove any special characters. An option
3461     value of 1 removes vertical space and 2 removes underscore. */
3462    
3463     if (tabopt < 0) tabopt = -tabopt;
3464     if (tabopt == 1) pbits[1] &= ~0x3c;
3465     else if (tabopt == 2) pbits[11] &= 0x7f;
3466    
3467     /* Add the POSIX table or its complement into the main table that is
3468     being built and we are done. */
3469    
3470     if (local_negate)
3471     for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
3472     else
3473     for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
3474    
3475 nigel 77 ptr = tempptr + 1;
3476     class_charcount = 10; /* Set > 1; assumes more than 1 per class */
3477     continue; /* End of POSIX syntax handling */
3478     }
3479    
3480     /* Backslash may introduce a single character, or it may introduce one
3481 nigel 93 of the specials, which just set a flag. The sequence \b is a special
3482 ph10 513 case. Inside a class (and only there) it is treated as backspace. We
3483     assume that other escapes have more than one character in them, so set
3484     class_charcount bigger than one. Unrecognized escapes fall through and
3485     are either treated as literal characters (by default), or are faulted if
3486     PCRE_EXTRA is set. */
3487 nigel 77
3488 ph10 391 if (c == CHAR_BACKSLASH)
3489 nigel 77 {
3490 nigel 93 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3491     if (*errorcodeptr != 0) goto FAILED;
3492 nigel 77
3493 ph10 513 if (-c == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
3494 nigel 77 else if (-c == ESC_Q) /* Handle start of quoted string */
3495     {
3496 ph10 391 if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3497 nigel 77 {
3498     ptr += 2; /* avoid empty string */
3499     }
3500     else inescq = TRUE;
3501     continue;
3502     }
3503 ph10 220 else if (-c == ESC_E) continue; /* Ignore orphan \E */
3504 nigel 77
3505     if (c < 0)
3506     {
3507     register const uschar *cbits = cd->cbits;
3508     class_charcount += 2; /* Greater than 1 is what matters */
3509 nigel 93
3510 ph10 518 switch (-c)
3511 nigel 77 {
3512 ph10 518 #ifdef SUPPORT_UCP
3513     case ESC_du: /* These are the values given for \d etc */
3514     case ESC_DU: /* when PCRE_UCP is set. We replace the */
3515     case ESC_wu: /* escape sequence with an appropriate \p */
3516     case ESC_WU: /* or \P to test Unicode properties instead */
3517     case ESC_su: /* of the default ASCII testing. */
3518     case ESC_SU:
3519     nestptr = ptr;
3520     ptr = substitutes[-c - ESC_DU] - 1; /* Just before substitute */
3521 ph10 535 class_charcount -= 2; /* Undo! */
3522 ph10 518 continue;
3523     #endif
3524 nigel 77 case ESC_d:
3525     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
3526     continue;
3527    
3528     case ESC_D:
3529 ph10 286 should_flip_negation = TRUE;
3530 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
3531     continue;
3532    
3533     case ESC_w:
3534     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
3535     continue;
3536    
3537     case ESC_W:
3538 ph10 286 should_flip_negation = TRUE;
3539 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
3540     continue;
3541    
3542 ph10 552 /* Perl 5.004 onwards omits VT from \s, but we must preserve it
3543     if it was previously set by something earlier in the character
3544     class. */
3545    
3546 nigel 77 case ESC_s:
3547 ph10 552 classbits[0] |= cbits[cbit_space];
3548     classbits[1] |= cbits[cbit_space+1] & ~0x08;
3549     for (c = 2; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
3550 nigel 77 continue;
3551    
3552     case ESC_S:
3553 ph10 286 should_flip_negation = TRUE;
3554 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
3555     classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
3556     continue;
3557    
3558 ph10 518 case ESC_h:
3559 ph10 178 SETBIT(classbits, 0x09); /* HT */
3560     SETBIT(classbits, 0x20); /* SPACE */
3561 ph10 180 SETBIT(classbits, 0xa0); /* NBSP */
3562 ph10 178 #ifdef SUPPORT_UTF8
3563     if (utf8)
3564 ph10 180 {
3565 ph10 178 class_utf8 = TRUE;
3566     *class_utf8data++ = XCL_SINGLE;
3567 ph10 180 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
3568 ph10 178 *class_utf8data++ = XCL_SINGLE;
3569 ph10 180 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
3570     *class_utf8data++ = XCL_RANGE;
3571     class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
3572     class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
3573 ph10 178 *class_utf8data++ = XCL_SINGLE;
3574 ph10 180 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
3575 ph10 178 *class_utf8data++ = XCL_SINGLE;
3576 ph10 180 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
3577 ph10 178 *class_utf8data++ = XCL_SINGLE;
3578 ph10 180 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
3579     }
3580     #endif
3581     continue;
3582 nigel 93
3583 ph10 518 case ESC_H:
3584 ph10 178 for (c = 0; c < 32; c++)
3585     {
3586     int x = 0xff;
3587     switch (c)
3588 ph10 180 {
3589 ph10 178 case 0x09/8: x ^= 1 << (0x09%8); break;
3590     case 0x20/8: x ^= 1 << (0x20%8); break;
3591     case 0xa0/8: x ^= 1 << (0xa0%8); break;
3592     default: break;
3593     }
3594     classbits[c] |= x;
3595 ph10 180 }
3596    
3597 ph10 178 #ifdef SUPPORT_UTF8
3598     if (utf8)
3599 ph10 180 {
3600 ph10 178 class_utf8 = TRUE;
3601 ph10 180 *class_utf8data++ = XCL_RANGE;
3602     class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3603     class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
3604     *class_utf8data++ = XCL_RANGE;
3605     class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
3606     class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
3607     *class_utf8data++ = XCL_RANGE;
3608     class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
3609     class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
3610     *class_utf8data++ = XCL_RANGE;
3611     class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
3612     class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
3613     *class_utf8data++ = XCL_RANGE;
3614     class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
3615     class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
3616     *class_utf8data++ = XCL_RANGE;
3617     class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
3618     class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
3619     *class_utf8data++ = XCL_RANGE;
3620     class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
3621     class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3622     }
3623     #endif
3624     continue;
3625 ph10 178
3626 ph10 518 case ESC_v:
3627 ph10 178 SETBIT(classbits, 0x0a); /* LF */
3628     SETBIT(classbits, 0x0b); /* VT */
3629 ph10 180 SETBIT(classbits, 0x0c); /* FF */
3630     SETBIT(classbits, 0x0d); /* CR */
3631     SETBIT(classbits, 0x85); /* NEL */
3632 ph10 178 #ifdef SUPPORT_UTF8
3633     if (utf8)
3634 ph10 180 {
3635 ph10 178 class_utf8 = TRUE;
3636 ph10 180 *class_utf8data++ = XCL_RANGE;
3637     class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
3638     class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3639     }
3640     #endif
3641     continue;
3642 ph10 178
3643 ph10 518 case ESC_V:
3644 ph10 178 for (c = 0; c < 32; c++)
3645     {
3646     int x = 0xff;
3647     switch (c)
3648 ph10 180 {
3649 ph10 178 case 0x0a/8: x ^= 1 << (0x0a%8);
3650     x ^= 1 << (0x0b%8);
3651     x ^= 1 << (0x0c%8);
3652 ph10 180 x ^= 1 << (0x0d%8);
3653 ph10 178 break;
3654     case 0x85/8: x ^= 1 << (0x85%8); break;
3655     default: break;
3656     }
3657     classbits[c] |= x;
3658 ph10 180 }
3659    
3660 ph10 178 #ifdef SUPPORT_UTF8
3661     if (utf8)
3662 ph10 180 {
3663 ph10 178 class_utf8 = TRUE;
3664 ph10 180 *class_utf8data++ = XCL_RANGE;
3665     class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3666     class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
3667     *class_utf8data++ = XCL_RANGE;
3668     class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3669     class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3670     }
3671     #endif
3672     continue;
3673 ph10 178
3674 nigel 77 #ifdef SUPPORT_UCP
3675 ph10 518 case ESC_p:
3676     case ESC_P:
3677     {
3678     BOOL negated;
3679     int pdata;
3680     int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
3681     if (ptype < 0) goto FAILED;
3682     class_utf8 = TRUE;
3683     *class_utf8data++ = ((-c == ESC_p) != negated)?
3684     XCL_PROP : XCL_NOTPROP;
3685     *class_utf8data++ = ptype;
3686     *class_utf8data++ = pdata;
3687     class_charcount -= 2; /* Not a < 256 character */
3688     continue;
3689     }
3690 nigel 77 #endif
3691 ph10 518 /* Unrecognized escapes are faulted if PCRE is running in its
3692     strict mode. By default, for compatibility with Perl, they are
3693     treated as literals. */
3694 nigel 77
3695 ph10 518 default:
3696     if ((options & PCRE_EXTRA) != 0)
3697     {
3698     *errorcodeptr = ERR7;
3699     goto FAILED;
3700     }
3701     class_charcount -= 2; /* Undo the default count from above */
3702     c = *ptr; /* Get the final character and fall through */
3703     break;
3704 nigel 93 }
3705 nigel 77 }
3706    
3707     /* Fall through if we have a single character (c >= 0). This may be
3708 nigel 93 greater than 256 in UTF-8 mode. */
3709 nigel 77
3710     } /* End of backslash handling */
3711    
3712     /* A single character may be followed by '-' to form a range. However,
3713     Perl does not permit ']' to be the end of the range. A '-' character
3714 nigel 93 at the end is treated as a literal. Perl ignores orphaned \E sequences
3715     entirely. The code for handling \Q and \E is messy. */
3716 nigel 77
3717 nigel 93 CHECK_RANGE:
3718 ph10 391 while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3719 nigel 77 {
3720 nigel 93 inescq = FALSE;
3721     ptr += 2;
3722     }
3723    
3724     oldptr = ptr;
3725 ph10 231
3726 ph10 230 /* Remember \r or \n */
3727 ph10 231
3728 ph10 391 if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
3729 ph10 231
3730 ph10 230 /* Check for range */
3731 nigel 93
3732 ph10 391 if (!inescq && ptr[1] == CHAR_MINUS)
3733 nigel 93 {
3734 nigel 77 int d;
3735     ptr += 2;
3736 ph10 391 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
3737 nigel 77
3738 nigel 93 /* If we hit \Q (not followed by \E) at this point, go into escaped
3739     mode. */
3740    
3741 ph10 391 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
3742 nigel 93 {
3743     ptr += 2;
3744 ph10 392 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
3745 ph10 391 { ptr += 2; continue; }
3746 nigel 93 inescq = TRUE;
3747     break;
3748     }
3749    
3750 ph10 391 if (*ptr == 0 || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
3751 nigel 93 {
3752     ptr = oldptr;
3753     goto LONE_SINGLE_CHARACTER;
3754     }
3755    
3756 nigel 77 #ifdef SUPPORT_UTF8
3757     if (utf8)
3758     { /* Braces are required because the */
3759     GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
3760     }
3761     else
3762     #endif
3763     d = *ptr; /* Not UTF-8 mode */
3764    
3765     /* The second part of a range can be a single-character escape, but
3766     not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3767     in such circumstances. */
3768    
3769 ph10 391 if (!inescq && d == CHAR_BACKSLASH)
3770 nigel 77 {
3771 nigel 93 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3772     if (*errorcodeptr != 0) goto FAILED;
3773 nigel 77
3774 ph10 514 /* \b is backspace; any other special means the '-' was literal */
3775 nigel 77
3776     if (d < 0)
3777     {
3778 ph10 514 if (d == -ESC_b) d = CHAR_BS; else
3779 nigel 77 {
3780 nigel 93 ptr = oldptr;
3781 nigel 77 goto LONE_SINGLE_CHARACTER; /* A few lines below */
3782     }
3783     }
3784     }
3785    
3786 nigel 93 /* Check that the two values are in the correct order. Optimize
3787     one-character ranges */
3788 nigel 77
3789 nigel 93 if (d < c)
3790     {
3791     *errorcodeptr = ERR8;
3792     goto FAILED;
3793     }
3794    
3795 nigel 77 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
3796    
3797 ph10 230 /* Remember \r or \n */
3798 ph10 231
3799 ph10 391 if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
3800 ph10 231
3801 nigel 77 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3802     matching, we have to use an XCLASS with extra data items. Caseless
3803     matching for characters > 127 is available only if UCP support is
3804     available. */
3805    
3806     #ifdef SUPPORT_UTF8
3807     if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
3808     {
3809     class_utf8 = TRUE;
3810    
3811     /* With UCP support, we can find the other case equivalents of
3812     the relevant characters. There may be several ranges. Optimize how
3813     they fit with the basic range. */
3814    
3815     #ifdef SUPPORT_UCP
3816     if ((options & PCRE_CASELESS) != 0)
3817     {
3818 nigel 93 unsigned int occ, ocd;
3819     unsigned int cc = c;
3820     unsigned int origd = d;
3821 nigel 77 while (get_othercase_range(&cc, origd, &occ, &ocd))
3822     {
3823 ph10 180 if (occ >= (unsigned int)c &&
3824     ocd <= (unsigned int)d)
3825 ph10 176 continue; /* Skip embedded ranges */
3826 nigel 77
3827 ph10 180 if (occ < (unsigned int)c &&
3828 ph10 176 ocd >= (unsigned int)c - 1) /* Extend the basic range */
3829 nigel 77 { /* if there is overlap, */
3830     c = occ; /* noting that if occ < c */
3831     continue; /* we can't have ocd > d */
3832     } /* because a subrange is */
3833 ph10 180 if (ocd > (unsigned int)d &&
3834 ph10 176 occ <= (unsigned int)d + 1) /* always shorter than */
3835 nigel 77 { /* the basic range. */
3836     d = ocd;
3837     continue;
3838     }
3839    
3840     if (occ == ocd)
3841     {
3842     *class_utf8data++ = XCL_SINGLE;
3843     }
3844     else
3845     {
3846     *class_utf8data++ = XCL_RANGE;
3847     class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3848     }
3849     class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3850     }
3851     }
3852     #endif /* SUPPORT_UCP */
3853    
3854     /* Now record the original range, possibly modified for UCP caseless
3855     overlapping ranges. */
3856    
3857     *class_utf8data++ = XCL_RANGE;
3858     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3859     class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3860    
3861     /* With UCP support, we are done. Without UCP support, there is no
3862     caseless matching for UTF-8 characters > 127; we can use the bit map
3863     for the smaller ones. */
3864    
3865     #ifdef SUPPORT_UCP
3866     continue; /* With next character in the class */
3867     #else
3868     if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3869    
3870     /* Adjust upper limit and fall through to set up the map */
3871    
3872     d = 127;
3873    
3874     #endif /* SUPPORT_UCP */
3875     }
3876     #endif /* SUPPORT_UTF8 */
3877    
3878     /* We use the bit map for all cases when not in UTF-8 mode; else
3879     ranges that lie entirely within 0-127 when there is UCP support; else
3880     for partial ranges without UCP support. */
3881    
3882 nigel 93 class_charcount += d - c + 1;
3883     class_lastchar = d;
3884    
3885     /* We can save a bit of time by skipping this in the pre-compile. */
3886    
3887     if (lengthptr == NULL) for (; c <= d; c++)
3888 nigel 77 {
3889     classbits[c/8] |= (1 << (c&7));
3890     if ((options & PCRE_CASELESS) != 0)
3891     {
3892     int uc = cd->fcc[c]; /* flip case */
3893     classbits[uc/8] |= (1 << (uc&7));
3894     }
3895     }
3896    
3897     continue; /* Go get the next char in the class */
3898     }
3899    
3900     /* Handle a lone single character - we can get here for a normal
3901     non-escape char, or after \ that introduces a single character or for an
3902     apparent range that isn't. */
3903    
3904     LONE_SINGLE_CHARACTER:
3905 ph10 231
3906 nigel 77 /* Handle a character that cannot go in the bit map */
3907    
3908     #ifdef SUPPORT_UTF8
3909     if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3910     {
3911     class_utf8 = TRUE;
3912     *class_utf8data++ = XCL_SINGLE;
3913     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3914    
3915     #ifdef SUPPORT_UCP
3916     if ((options & PCRE_CASELESS) != 0)
3917     {
3918 nigel 93 unsigned int othercase;
3919 ph10 349 if ((othercase = UCD_OTHERCASE(c)) != c)
3920 nigel 77 {
3921     *class_utf8data++ = XCL_SINGLE;
3922     class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3923     }
3924     }
3925     #endif /* SUPPORT_UCP */
3926    
3927     }
3928     else
3929     #endif /* SUPPORT_UTF8 */
3930    
3931     /* Handle a single-byte character */
3932     {
3933     classbits[c/8] |= (1 << (c&7));
3934     if ((options & PCRE_CASELESS) != 0)
3935     {
3936     c = cd->fcc[c]; /* flip case */
3937     classbits[c/8] |= (1 << (c&7));
3938     }
3939     class_charcount++;
3940     class_lastchar = c;
3941     }
3942     }
3943    
3944 ph10 518 /* Loop until ']' reached. This "while" is the end of the "do" far above.
3945     If we are at the end of an internal nested string, revert to the outer
3946     string. */
3947 nigel 77
3948 ph10