/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 605 - (hide annotations) (download)
Fri Jun 3 18:18:30 2011 UTC (3 years, 3 months ago) by ph10
File MIME type: text/plain
File size: 238607 byte(s)
Make pcre_study() more robust against update omissions; fix ONCE oversight.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 598 Copyright (c) 1997-2011 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains the external function pcre_compile(), along with
42     supporting internal functions that are not used by other modules. */
43    
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 236 #include "config.h"
47 ph10 200 #endif
48 ph10 199
49 nigel 93 #define NLBLOCK cd /* Block containing newline information */
50     #define PSSTART start_pattern /* Field containing processed string start */
51     #define PSEND end_pattern /* Field containing processed string end */
52    
53 nigel 77 #include "pcre_internal.h"
54    
55    
56 ph10 475 /* When PCRE_DEBUG is defined, we need the pcre_printint() function, which is
57     also used by pcretest. PCRE_DEBUG is not defined when building a production
58     library. */
59 nigel 85
60 ph10 475 #ifdef PCRE_DEBUG
61 nigel 85 #include "pcre_printint.src"
62     #endif
63    
64    
/* Macro for setting individual bits in class bitmaps. The first argument is
the bitmap (an array of bytes) and the second is the bit number. Both arguments
and the whole expansion are parenthesized so that expressions such as
SETBIT(map, c + 1) index and shift using the complete value of the second
argument instead of being split up by operator precedence. The second argument
is evaluated twice, so it must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
68    
69 ph10 202 /* Maximum length value to check against when making sure that the integer that
70     holds the compiled pattern length does not overflow. We make it a bit less than
71     INT_MAX to allow for adding in group terminating bytes, so that we don't have
72     to check them every time. */
73 ph10 178
74 ph10 202 #define OFLOW_MAX (INT_MAX - 20)
75    
76    
77 nigel 77 /*************************************************
78     * Code parameters and static tables *
79     *************************************************/
80    
81 nigel 93 /* This value specifies the size of stack workspace that is used during the
82     first pre-compile phase that determines how much memory is required. The regex
83     is partly compiled into this space, but the compiled parts are discarded as
84     soon as they can be, so that hopefully there will never be an overrun. The code
85     does, however, check for an overrun. The largest amount I've seen used is 218,
86     so this number is very generous.
87 nigel 77
88 nigel 93 The same workspace is used during the second, actual compile phase for
89     remembering forward references to groups so that they can be filled in at the
90     end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
91     is 4 there is plenty of room. */
92 nigel 77
93 nigel 93 #define COMPILE_WORK_SIZE (4096)
94 nigel 77
95 ph10 507 /* The overrun tests check for a slightly smaller size so that they detect the
96 ph10 505 overrun before it actually does run off the end of the data block. */
97 nigel 93
98 ph10 505 #define WORK_SIZE_CHECK (COMPILE_WORK_SIZE - 100)
99    
100    
101 nigel 77 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
102     are simple data values; negative values are for special things like \d and so
103     on. Zero means further processing is needed (for things like \x), or the escape
104     is invalid. */
105    
106 ph10 391 #ifndef EBCDIC
107    
108     /* This is the "normal" table for ASCII systems or for EBCDIC systems running
109 ph10 392 in UTF-8 mode. */
110 ph10 391
111 ph10 392 static const short int escapes[] = {
112 ph10 391 0, 0,
113     0, 0,
114 ph10 392 0, 0,
115     0, 0,
116     0, 0,
117 ph10 391 CHAR_COLON, CHAR_SEMICOLON,
118 ph10 392 CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN,
119 ph10 391 CHAR_GREATER_THAN_SIGN, CHAR_QUESTION_MARK,
120 ph10 392 CHAR_COMMERCIAL_AT, -ESC_A,
121     -ESC_B, -ESC_C,
122     -ESC_D, -ESC_E,
123     0, -ESC_G,
124     -ESC_H, 0,
125     0, -ESC_K,
126 ph10 391 0, 0,
127 ph10 514 -ESC_N, 0,
128 ph10 391 -ESC_P, -ESC_Q,
129     -ESC_R, -ESC_S,
130 ph10 392 0, 0,
131     -ESC_V, -ESC_W,
132     -ESC_X, 0,
133     -ESC_Z, CHAR_LEFT_SQUARE_BRACKET,
134 ph10 391 CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET,
135 ph10 392 CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE,
136 ph10 391 CHAR_GRAVE_ACCENT, 7,
137 ph10 392 -ESC_b, 0,
138     -ESC_d, ESC_e,
139 ph10 391 ESC_f, 0,
140     -ESC_h, 0,
141 ph10 392 0, -ESC_k,
142 ph10 391 0, 0,
143     ESC_n, 0,
144 ph10 392 -ESC_p, 0,
145     ESC_r, -ESC_s,
146 ph10 391 ESC_tee, 0,
147 ph10 392 -ESC_v, -ESC_w,
148     0, 0,
149 ph10 391 -ESC_z
150 nigel 77 };
151    
152 ph10 392 #else
153 ph10 391
154     /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
155    
156 nigel 77 static const short int escapes[] = {
157     /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
158     /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
159     /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
160     /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
161     /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
162     /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
163     /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
164     /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
165 ph10 178 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
166 nigel 93 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
167 nigel 77 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
168 ph10 178 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
169 nigel 77 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
170     /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
171     /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
172     /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
173 ph10 178 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
174 ph10 514 /* D0 */ '}', 0, -ESC_K, 0, 0,-ESC_N, 0, -ESC_P,
175 nigel 93 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
176 ph10 178 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
177 nigel 77 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
178     /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
179     /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
180     };
181     #endif
182    
183    
184 ph10 243 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
185     searched linearly. Put all the names into a single string, in order to reduce
186 ph10 392 the number of relocations when a shared library is dynamically linked. The
187     string is built from string macros so that it works in UTF-8 mode on EBCDIC
188 ph10 391 platforms. */
189 ph10 210
190     typedef struct verbitem {
191 ph10 510 int len; /* Length of verb name */
192     int op; /* Op when no arg, or -1 if arg mandatory */
193     int op_arg; /* Op when arg present, or -1 if not allowed */
194 ph10 211 } verbitem;
195 ph10 210
196 ph10 240 static const char verbnames[] =
197 ph10 510 "\0" /* Empty name is a shorthand for MARK */
198 ph10 512 STRING_MARK0
199 ph10 391 STRING_ACCEPT0
200     STRING_COMMIT0
201     STRING_F0
202     STRING_FAIL0
203     STRING_PRUNE0
204     STRING_SKIP0
205     STRING_THEN;
206 ph10 240
207 ph10 327 static const verbitem verbs[] = {
208 ph10 510 { 0, -1, OP_MARK },
209 ph10 512 { 4, -1, OP_MARK },
210 ph10 510 { 6, OP_ACCEPT, -1 },
211     { 6, OP_COMMIT, -1 },
212     { 1, OP_FAIL, -1 },
213     { 4, OP_FAIL, -1 },
214     { 5, OP_PRUNE, OP_PRUNE_ARG },
215     { 4, OP_SKIP, OP_SKIP_ARG },
216     { 4, OP_THEN, OP_THEN_ARG }
217 ph10 210 };
218    
219 ph10 327 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
220 ph10 210
221    
222 ph10 243 /* Tables of names of POSIX character classes and their lengths. The names are
223     now all in a single string, to reduce the number of relocations when a shared
224 ph10 240 library is dynamically loaded. The list of lengths is terminated by a zero
225     length entry. The first three must be alpha, lower, upper, as this is assumed
226     for handling case independence. */
227 nigel 77
228 ph10 240 static const char posix_names[] =
229 ph10 392 STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
230     STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
231 ph10 391 STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
232     STRING_word0 STRING_xdigit;
233 nigel 77
234     static const uschar posix_name_lengths[] = {
235     5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
236    
237 nigel 87 /* Table of class bit maps for each POSIX class. Each class is formed from a
238     base map, with an optional addition or removal of another map. Then, for some
239     classes, there is some additional tweaking: for [:blank:] the vertical space
240     characters are removed, and for [:alpha:] and [:alnum:] the underscore
241     character is removed. The triples in the table consist of the base map offset,
242     second map offset or -1 if no second map, and a non-negative value for map
243     addition or a negative value for map subtraction (if there are two maps). The
244     absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
245     remove vertical space characters, 2 => remove underscore. */
246 nigel 77
247     static const int posix_class_maps[] = {
248 nigel 87 cbit_word, cbit_digit, -2, /* alpha */
249     cbit_lower, -1, 0, /* lower */
250     cbit_upper, -1, 0, /* upper */
251     cbit_word, -1, 2, /* alnum - word without underscore */
252     cbit_print, cbit_cntrl, 0, /* ascii */
253     cbit_space, -1, 1, /* blank - a GNU extension */
254     cbit_cntrl, -1, 0, /* cntrl */
255     cbit_digit, -1, 0, /* digit */
256     cbit_graph, -1, 0, /* graph */
257     cbit_print, -1, 0, /* print */
258     cbit_punct, -1, 0, /* punct */
259     cbit_space, -1, 0, /* space */
260     cbit_word, -1, 0, /* word - a Perl extension */
261     cbit_xdigit,-1, 0 /* xdigit */
262 nigel 77 };
263    
264 ph10 535 /* Table of substitutes for \d etc when PCRE_UCP is set. The POSIX class
265     substitutes must be in the order of the names, defined above, and there are
266 ph10 518 both positive and negative cases. NULL means no substitute. */
267 nigel 77
268 ph10 518 #ifdef SUPPORT_UCP
269     static const uschar *substitutes[] = {
270     (uschar *)"\\P{Nd}", /* \D */
271     (uschar *)"\\p{Nd}", /* \d */
272     (uschar *)"\\P{Xsp}", /* \S */ /* NOTE: Xsp is Perl space */
273     (uschar *)"\\p{Xsp}", /* \s */
274     (uschar *)"\\P{Xwd}", /* \W */
275 ph10 535 (uschar *)"\\p{Xwd}" /* \w */
276 ph10 518 };
277 ph10 535
278 ph10 518 static const uschar *posix_substitutes[] = {
279     (uschar *)"\\p{L}", /* alpha */
280 ph10 535 (uschar *)"\\p{Ll}", /* lower */
281     (uschar *)"\\p{Lu}", /* upper */
282     (uschar *)"\\p{Xan}", /* alnum */
283 ph10 518 NULL, /* ascii */
284     (uschar *)"\\h", /* blank */
285     NULL, /* cntrl */
286     (uschar *)"\\p{Nd}", /* digit */
287     NULL, /* graph */
288     NULL, /* print */
289     NULL, /* punct */
290     (uschar *)"\\p{Xps}", /* space */ /* NOTE: Xps is POSIX space */
291     (uschar *)"\\p{Xwd}", /* word */
292 ph10 535 NULL, /* xdigit */
293 ph10 518 /* Negated cases */
294     (uschar *)"\\P{L}", /* ^alpha */
295 ph10 535 (uschar *)"\\P{Ll}", /* ^lower */
296     (uschar *)"\\P{Lu}", /* ^upper */
297     (uschar *)"\\P{Xan}", /* ^alnum */
298 ph10 518 NULL, /* ^ascii */
299     (uschar *)"\\H", /* ^blank */
300     NULL, /* ^cntrl */
301     (uschar *)"\\P{Nd}", /* ^digit */
302     NULL, /* ^graph */
303     NULL, /* ^print */
304     NULL, /* ^punct */
305     (uschar *)"\\P{Xps}", /* ^space */ /* NOTE: Xps is POSIX space */
306     (uschar *)"\\P{Xwd}", /* ^word */
307 ph10 535 NULL /* ^xdigit */
308 ph10 518 };
309     #define POSIX_SUBSIZE (sizeof(posix_substitutes)/sizeof(uschar *))
310 ph10 535 #endif
311 ph10 518
312 nigel 93 #define STRING(a) # a
313     #define XSTRING(s) STRING(s)
314    
315 nigel 77 /* The texts of compile-time error messages. These are "char *" because they
316 nigel 93 are passed to the outside world. Do not ever re-use any error number, because
317     they are documented. Always add a new error instead. Messages marked DEAD below
318 ph10 243 are no longer used. This used to be a table of strings, but in order to reduce
319     the number of relocations needed when a shared library is loaded dynamically,
320     it is now one long string. We cannot use a table of offsets, because the
321     lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
322     simply count through to the one we want - this isn't a performance issue
323 ph10 507 because these strings are used only when there is a compilation error.
324 nigel 77
325 ph10 507 Each substring ends with \0 to insert a null character. This includes the final
326     substring, so that the whole string ends with \0\0, which can be detected when
327 ph10 499 counting through. */
328    
329 ph10 240 static const char error_texts[] =
330     "no error\0"
331     "\\ at end of pattern\0"
332     "\\c at end of pattern\0"
333     "unrecognized character follows \\\0"
334     "numbers out of order in {} quantifier\0"
335 nigel 77 /* 5 */
336 ph10 240 "number too big in {} quantifier\0"
337     "missing terminating ] for character class\0"
338     "invalid escape sequence in character class\0"
339     "range out of order in character class\0"
340     "nothing to repeat\0"
341 nigel 77 /* 10 */
342 ph10 240 "operand of unlimited repeat could match the empty string\0" /** DEAD **/
343     "internal error: unexpected repeat\0"
344 ph10 269 "unrecognized character after (? or (?-\0"
345 ph10 240 "POSIX named classes are supported only within a class\0"
346     "missing )\0"
347 nigel 77 /* 15 */
348 ph10 240 "reference to non-existent subpattern\0"
349     "erroffset passed as NULL\0"
350     "unknown option bit(s) set\0"
351     "missing ) after comment\0"
352     "parentheses nested too deeply\0" /** DEAD **/
353 nigel 77 /* 20 */
354 ph10 240 "regular expression is too large\0"
355     "failed to get memory\0"
356     "unmatched parentheses\0"
357     "internal error: code overflow\0"
358     "unrecognized character after (?<\0"
359 nigel 77 /* 25 */
360 ph10 240 "lookbehind assertion is not fixed length\0"
361     "malformed number or name after (?(\0"
362     "conditional group contains more than two branches\0"
363     "assertion expected after (?(\0"
364     "(?R or (?[+-]digits must be followed by )\0"
365 nigel 77 /* 30 */
366 ph10 240 "unknown POSIX class name\0"
367     "POSIX collating elements are not supported\0"
368     "this version of PCRE is not compiled with PCRE_UTF8 support\0"
369     "spare error\0" /** DEAD **/
370     "character value in \\x{...} sequence is too large\0"
371 nigel 77 /* 35 */
372 ph10 240 "invalid condition (?(0)\0"
373     "\\C not allowed in lookbehind assertion\0"
374 ph10 514 "PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0"
375 ph10 240 "number after (?C is > 255\0"
376     "closing ) for (?C expected\0"
377 nigel 77 /* 40 */
378 ph10 240 "recursive call could loop indefinitely\0"
379     "unrecognized character after (?P\0"
380     "syntax error in subpattern name (missing terminator)\0"
381     "two named subpatterns have the same name\0"
382     "invalid UTF-8 string\0"
383 nigel 77 /* 45 */
384 ph10 240 "support for \\P, \\p, and \\X has not been compiled\0"
385     "malformed \\P or \\p sequence\0"
386     "unknown property name after \\P or \\p\0"
387     "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
388     "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
389 nigel 91 /* 50 */
390 ph10 240 "repeated subpattern is too long\0" /** DEAD **/
391     "octal value is greater than \\377 (not in UTF-8 mode)\0"
392     "internal error: overran compiling workspace\0"
393     "internal error: previously-checked referenced subpattern not found\0"
394     "DEFINE group contains more than one branch\0"
395 nigel 93 /* 55 */
396 ph10 240 "repeating a DEFINE group is not allowed\0"
397     "inconsistent NEWLINE options\0"
398 ph10 333 "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
399     "a numbered reference must not be zero\0"
400 ph10 510 "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0"
401 ph10 211 /* 60 */
402 ph10 240 "(*VERB) not recognized\0"
403 ph10 268 "number is too big\0"
404 ph10 272 "subpattern name expected\0"
405 ph10 336 "digit expected after (?+\0"
406 ph10 457 "] is an invalid data character in JavaScript compatibility mode\0"
407     /* 65 */
408 ph10 510 "different names for subpatterns of the same number are not allowed\0"
409 ph10 512 "(*MARK) must have an argument\0"
410 ph10 535 "this version of PCRE is not compiled with PCRE_UCP support\0"
411 ph10 579 "\\c must be followed by an ASCII character\0"
412 ph10 510 ;
413 nigel 77
414     /* Table to identify digits and hex digits. This is used when compiling
415     patterns. Note that the tables in chartables are dependent on the locale, and
416     may mark arbitrary characters as digits - but the PCRE compiling code expects
417     to handle only 0-9, a-f, and A-F as digits when compiling. That is why we have
418     a private table here. It costs 256 bytes, but it is a lot faster than doing
419     character value tests (at least in some simple cases I timed), and in some
420     applications one wants PCRE to compile efficiently as well as match
421     efficiently.
422    
423     For convenience, we use the same bit definitions as in chartables:
424    
425     0x04 decimal digit
426     0x08 hexadecimal digit
427    
428     Then we can use ctype_digit and ctype_xdigit in the code. */
429    
430 ph10 392 #ifndef EBCDIC
431 ph10 391
432 ph10 392 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
433 ph10 391 UTF-8 mode. */
434    
435 nigel 77 static const unsigned char digitab[] =
436     {
437     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
438     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
439     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
440     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
441     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
442     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
443     0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
444     0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
445     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
446     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
447     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
448     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
449     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
450     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
451     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
452     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
453     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
454     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
455     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
456     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
457     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
458     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
459     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
460     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
461     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
462     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
463     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
464     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
465     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
466     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
467     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
468     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
469    
470 ph10 392 #else
471 ph10 391
472     /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
473    
474 nigel 77 static const unsigned char digitab[] =
475     {
476     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
477     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
478     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
479     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
480     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
481     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
482     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
483     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
484     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
485     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
486     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
487 ph10 97 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
488 nigel 77 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
489     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
490     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
491     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
492     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
493     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
494     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
495     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
496     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
497     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
498     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
499     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
500     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
501     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
502     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
503     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
504     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
505     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
506     0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
507     0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
508    
509     static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
510     0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
511     0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
512     0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
513     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
514     0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
515     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
516     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
517     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
518     0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
519     0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
520     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
521 ph10 97 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
522 nigel 77 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
523     0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
524     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
525     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
526     0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
527     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
528     0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
529     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
530     0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
531     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
532     0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
533     0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
534     0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
535     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
536     0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
537     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
538     0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
539     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
540     0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
541     0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
542     #endif
543    
544    
545     /* Definition to allow mutual recursion */
546    
547     static BOOL
548 ph10 604 compile_regex(int, uschar **, const uschar **, int *, BOOL, BOOL, int, int *,
549     int *, branch_chain *, compile_data *, int *);
550 nigel 77
551    
552    
553     /*************************************************
554 ph10 240 * Find an error text *
555     *************************************************/
556    
557 ph10 243 /* The error texts are now all in one long string, to save on relocations. As
558     some of the text is of unknown length, we can't use a table of offsets.
559     Instead, just count through the strings. This is not a performance issue
560 ph10 240 because it happens only when there has been a compilation error.
561    
562     Argument: the error number
563     Returns: pointer to the error string
564     */
565    
566     static const char *
567     find_error_text(int n)
568     {
569     const char *s = error_texts;
570 ph10 507 for (; n > 0; n--)
571 ph10 499 {
572     while (*s++ != 0) {};
573     if (*s == 0) return "Error text not found (please report)";
574 ph10 507 }
575 ph10 240 return s;
576     }
577    
578    
579     /*************************************************
580 nigel 77 * Handle escapes *
581     *************************************************/
582    
583     /* This function is called when a \ has been encountered. It either returns a
584     positive value for a simple escape such as \n, or a negative value which
585 nigel 93 encodes one of the more complicated things such as \d. A backreference to group
586     n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
587     UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
588     ptr is pointing at the \. On exit, it is on the final character of the escape
589     sequence.
590 nigel 77
591     Arguments:
592     ptrptr points to the pattern position pointer
593     errorcodeptr points to the errorcode variable
594     bracount number of previous extracting brackets
595     options the options bits
596     isclass TRUE if inside a character class
597    
598     Returns: zero or positive => a data character
599     negative => a special escape sequence
600 ph10 213 on error, errorcodeptr is set
601 nigel 77 */
602    
603     static int
604     check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
605     int options, BOOL isclass)
606     {
607 nigel 87 BOOL utf8 = (options & PCRE_UTF8) != 0;
608     const uschar *ptr = *ptrptr + 1;
609 nigel 77 int c, i;
610    
611 nigel 87 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
612     ptr--; /* Set pointer back to the last byte */
613    
614 nigel 77 /* If backslash is at the end of the pattern, it's an error. */
615    
616     if (c == 0) *errorcodeptr = ERR1;
617    
618 ph10 274 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
619     in a table. A non-zero result is something that can be returned immediately.
620 nigel 77 Otherwise further processing may be required. */
621    
622 ph10 391 #ifndef EBCDIC /* ASCII/UTF-8 coding */
623     else if (c < CHAR_0 || c > CHAR_z) {} /* Not alphanumeric */
624     else if ((i = escapes[c - CHAR_0]) != 0) c = i;
625 nigel 77
626 ph10 97 #else /* EBCDIC coding */
627 ph10 274 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphanumeric */
628 nigel 77 else if ((i = escapes[c - 0x48]) != 0) c = i;
629     #endif
630    
631     /* Escapes that need further processing, or are illegal. */
632    
633     else
634     {
635     const uschar *oldptr;
636 nigel 93 BOOL braced, negated;
637    
638 nigel 77 switch (c)
639     {
640     /* A number of Perl escapes are not handled by PCRE. We give an explicit
641     error. */
642    
643 ph10 391 case CHAR_l:
644     case CHAR_L:
645     case CHAR_u:
646     case CHAR_U:
647 nigel 77 *errorcodeptr = ERR37;
648     break;
649    
650 ph10 333 /* \g must be followed by one of a number of specific things:
651 ph10 345
652 ph10 333 (1) A number, either plain or braced. If positive, it is an absolute
653     backreference. If negative, it is a relative backreference. This is a Perl
654     5.10 feature.
655 ph10 345
656 ph10 333 (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
657     is part of Perl's movement towards a unified syntax for back references. As
658     this is synonymous with \k{name}, we fudge it up by pretending it really
659     was \k.
660 ph10 345
661     (3) For Oniguruma compatibility we also support \g followed by a name or a
662     number either in angle brackets or in single quotes. However, these are
663     (possibly recursive) subroutine calls, _not_ backreferences. Just return
664 ph10 333 the -ESC_g code (cf \k). */
665 nigel 93
666 ph10 391 case CHAR_g:
667     if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
668 ph10 333 {
669     c = -ESC_g;
670 ph10 345 break;
671     }
672 ph10 333
673     /* Handle the Perl-compatible cases */
674 ph10 345
675 ph10 391 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
676 nigel 93 {
677 ph10 171 const uschar *p;
678 ph10 391 for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
679     if (*p != CHAR_MINUS && (digitab[*p] & ctype_digit) == 0) break;
680     if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)
681 ph10 171 {
682     c = -ESC_k;
683     break;
684 ph10 172 }
685 nigel 93 braced = TRUE;
686     ptr++;
687     }
688     else braced = FALSE;
689    
690 ph10 391 if (ptr[1] == CHAR_MINUS)
691 nigel 93 {
692     negated = TRUE;
693     ptr++;
694     }
695     else negated = FALSE;
696    
697     c = 0;
698     while ((digitab[ptr[1]] & ctype_digit) != 0)
699 ph10 391 c = c * 10 + *(++ptr) - CHAR_0;
700 ph10 220
701 ph10 333 if (c < 0) /* Integer overflow */
702 ph10 213 {
703     *errorcodeptr = ERR61;
704     break;
705 ph10 220 }
706 ph10 345
707 ph10 391 if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
708 nigel 93 {
709     *errorcodeptr = ERR57;
710 ph10 213 break;
711 nigel 93 }
712 ph10 345
713 ph10 333 if (c == 0)
714     {
715     *errorcodeptr = ERR58;
716     break;
717 ph10 345 }
718 nigel 93
719     if (negated)
720     {
721     if (c > bracount)
722     {
723     *errorcodeptr = ERR15;
724 ph10 213 break;
725 nigel 93 }
726     c = bracount - (c - 1);
727     }
728    
729     c = -(ESC_REF + c);
730     break;
731    
732 nigel 77 /* The handling of escape sequences consisting of a string of digits
733     starting with one that is not zero is not straightforward. By experiment,
734     the way Perl works seems to be as follows:
735    
736     Outside a character class, the digits are read as a decimal number. If the
737     number is less than 10, or if there are that many previous extracting
738     left brackets, then it is a back reference. Otherwise, up to three octal
739     digits are read to form an escaped byte. Thus \123 is likely to be octal
740     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
741     value is greater than 377, the least significant 8 bits are taken. Inside a
742     character class, \ followed by a digit is always an octal number. */
743    
744 ph10 391 case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
745     case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
746 nigel 77
747     if (!isclass)
748     {
749     oldptr = ptr;
750 ph10 391 c -= CHAR_0;
751 nigel 77 while ((digitab[ptr[1]] & ctype_digit) != 0)
752 ph10 391 c = c * 10 + *(++ptr) - CHAR_0;
753 ph10 333 if (c < 0) /* Integer overflow */
754 ph10 213 {
755     *errorcodeptr = ERR61;
756 ph10 220 break;
757     }
758 nigel 77 if (c < 10 || c <= bracount)
759     {
760     c = -(ESC_REF + c);
761     break;
762     }
763     ptr = oldptr; /* Put the pointer back and fall through */
764     }
765    
766     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
767     generates a binary zero byte and treats the digit as a following literal.
768     Thus we have to pull back the pointer by one. */
769    
770 ph10 391 if ((c = *ptr) >= CHAR_8)
771 nigel 77 {
772     ptr--;
773     c = 0;
774     break;
775     }
776    
777     /* \0 always starts an octal number, but we may drop through to here with a
778 nigel 91 larger first octal digit. The original code used just to take the least
779     significant 8 bits of octal numbers (I think this is what early Perls used
780     to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
781     than 3 octal digits. */
782 nigel 77
783 ph10 391 case CHAR_0:
784     c -= CHAR_0;
785     while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
786     c = c * 8 + *(++ptr) - CHAR_0;
787 nigel 91 if (!utf8 && c > 255) *errorcodeptr = ERR51;
788 nigel 77 break;
789    
790 nigel 87 /* \x is complicated. \x{ddd} is a character number which can be greater
791     than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
792     treated as a data character. */
793 nigel 77
794 ph10 391 case CHAR_x:
795     if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
796 nigel 77 {
797     const uschar *pt = ptr + 2;
798 nigel 87 int count = 0;
799    
800 nigel 77 c = 0;
801     while ((digitab[*pt] & ctype_xdigit) != 0)
802     {
803 nigel 87 register int cc = *pt++;
804 ph10 391 if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
805 nigel 77 count++;
806 nigel 87
807 ph10 391 #ifndef EBCDIC /* ASCII/UTF-8 coding */
808     if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
809     c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
810 ph10 97 #else /* EBCDIC coding */
811 ph10 391 if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
812     c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
813 nigel 77 #endif
814     }
815 nigel 87
816 ph10 391 if (*pt == CHAR_RIGHT_CURLY_BRACKET)
817 nigel 77 {
818 nigel 87 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
819 nigel 77 ptr = pt;
820     break;
821     }
822 nigel 87
823 nigel 77 /* If the sequence of hex digits does not end with '}', then we don't
824     recognize this construct; fall through to the normal \x handling. */
825     }
826    
827 nigel 87 /* Read just a single-byte hex-defined char */
828 nigel 77
829     c = 0;
830     while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
831     {
832 ph10 391 int cc; /* Some compilers don't like */
833     cc = *(++ptr); /* ++ in initializers */
834     #ifndef EBCDIC /* ASCII/UTF-8 coding */
835     if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
836     c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
837 ph10 97 #else /* EBCDIC coding */
838 ph10 391 if (cc <= CHAR_z) cc += 64; /* Convert to upper case */
839     c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
840 nigel 77 #endif
841     }
842     break;
843    
844 nigel 93 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
845 ph10 574 An error is given if the byte following \c is not an ASCII character. This
846     coding is ASCII-specific, but then the whole concept of \cx is
847 nigel 93 ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
848 nigel 77
849 ph10 391 case CHAR_c:
850 nigel 77 c = *(++ptr);
851     if (c == 0)
852     {
853     *errorcodeptr = ERR2;
854 ph10 213 break;
855 nigel 77 }
856 ph10 574 #ifndef EBCDIC /* ASCII/UTF-8 coding */
857     if (c > 127) /* Excludes all non-ASCII in either mode */
858     {
859     *errorcodeptr = ERR68;
860 ph10 579 break;
861     }
862 ph10 391 if (c >= CHAR_a && c <= CHAR_z) c -= 32;
863 nigel 77 c ^= 0x40;
864 ph10 574 #else /* EBCDIC coding */
865 ph10 391 if (c >= CHAR_a && c <= CHAR_z) c += 64;
866 nigel 77 c ^= 0xC0;
867     #endif
868     break;
869    
870     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
871 ph10 274 other alphanumeric following \ is an error if PCRE_EXTRA was set;
872     otherwise, for Perl compatibility, it is a literal. This code looks a bit
873     odd, but there used to be some cases other than the default, and there may
874     be again in future, so I haven't "optimized" it. */
875 nigel 77
876     default:
877     if ((options & PCRE_EXTRA) != 0) switch(c)
878     {
879     default:
880     *errorcodeptr = ERR3;
881     break;
882     }
883     break;
884     }
885     }
886 ph10 518
887     /* Perl supports \N{name} for character names, as well as plain \N for "not
888 ph10 514 newline". PCRE does not support \N{name}. */
889 nigel 77
890 ph10 514 if (c == -ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET)
891 ph10 518 *errorcodeptr = ERR37;
892 ph10 514
893 ph10 518 /* If PCRE_UCP is set, we change the values for \d etc. */
894    
895     if ((options & PCRE_UCP) != 0 && c <= -ESC_D && c >= -ESC_w)
896     c -= (ESC_DU - ESC_D);
897    
898     /* Set the pointer to the final character before returning. */
899    
900 nigel 77 *ptrptr = ptr;
901     return c;
902     }
903    
904    
905    
#ifdef SUPPORT_UCP
/*************************************************
*               Handle \P and \p                 *
*************************************************/

/* Decode the property name that follows \p or \P. This function is compiled
only when SUPPORT_UCP is defined. The name is either the single character after
the P/p, or a string enclosed in curly brackets; in the bracketed form a
leading circumflex requests negation. The name is then looked up in the
property name table by binary chop.

Arguments:
  ptrptr         points to the pattern position pointer; on entry it addresses
                   the P or p, on exit the last character that was examined
  negptr         points to a boolean that is set TRUE for negation else FALSE
                   (left untouched if the pattern ends right after the P/p)
  dptr           points to an int that is set to the detailed property value
                   (written only when the name is recognized)
  errorcodeptr   points to the error code variable; set to ERR46 when the
                   sequence is malformed (pattern ends too soon, or the
                   bracketed name does not fit in the local buffer) and to
                   ERR47 when the name is not in the table

Returns:         the type field of the matching table entry,
                   or -1 for an invalid type
*/

static int
get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
{
const uschar *ptr = *ptrptr;
char name[32];
int ch, lo, hi;

ch = *(++ptr);
if (ch == 0) goto MALFORMED;

*negptr = FALSE;

/* The short form: exactly one character names the property. */

if (ch != CHAR_LEFT_CURLY_BRACKET)
  {
  name[0] = ch;
  name[1] = 0;
  }

/* The long form: {name} or {^name}. Collect characters up to the closing
bracket, giving up if the pattern ends or the buffer (which must keep one byte
for the terminating zero) would overflow. */

else
  {
  int len = 0;

  if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
    {
    *negptr = TRUE;
    ptr++;
    }

  for (;;)
    {
    if (len >= (int)sizeof(name) - 1) goto MALFORMED;
    ch = *(++ptr);
    if (ch == 0) goto MALFORMED;
    if (ch == CHAR_RIGHT_CURLY_BRACKET) break;
    name[len++] = ch;
    }

  name[len] = 0;
  }

/* The sequence is syntactically complete; from here on the caller's pointer
addresses its final character whether or not the name is known. */

*ptrptr = ptr;

/* Binary chop over the name table (this relies on the table being ordered
consistently with strcmp()). */

lo = 0;
hi = _pcre_utt_size;

while (lo < hi)
  {
  int mid = (lo + hi) >> 1;
  int cmp = strcmp(name, _pcre_utt_names + _pcre_utt[mid].name_offset);

  if (cmp > 0) lo = mid + 1;
  else if (cmp < 0) hi = mid;
  else
    {
    *dptr = _pcre_utt[mid].value;
    return _pcre_utt[mid].type;
    }
  }

/* Well-formed, but not a name we know. */

*errorcodeptr = ERR47;
return -1;

/* Badly formed sequence. */

MALFORMED:
*errorcodeptr = ERR46;
*ptrptr = ptr;
return -1;
}
#endif
995    
996    
997    
998    
999     /*************************************************
1000     * Check for counted repeat *
1001     *************************************************/
1002    
1003     /* This function is called when a '{' is encountered in a place where it might
1004     start a quantifier. It looks ahead to see if it really is a quantifier or not.
1005     It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
1006     where the ddds are digits.
1007    
1008     Arguments:
1009     p pointer to the first char after '{'
1010    
1011     Returns: TRUE or FALSE
1012     */
1013    
1014     static BOOL
1015     is_counted_repeat(const uschar *p)
1016     {
1017     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
1018     while ((digitab[*p] & ctype_digit) != 0) p++;
1019 ph10 391 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
1020 nigel 77
1021 ph10 391 if (*p++ != CHAR_COMMA) return FALSE;
1022     if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
1023 nigel 77
1024     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
1025     while ((digitab[*p] & ctype_digit) != 0) p++;
1026    
1027 ph10 391 return (*p == CHAR_RIGHT_CURLY_BRACKET);
1028 nigel 77 }
1029    
1030    
1031    
1032     /*************************************************
1033     * Read repeat counts *
1034     *************************************************/
1035    
1036     /* Read an item of the form {n,m} and return the values. This is called only
1037     after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1038     so the syntax is guaranteed to be correct, but we need to check the values.
1039    
1040     Arguments:
1041     p pointer to first char after '{'
1042     minp pointer to int for min
1043     maxp pointer to int for max
1044     returned as -1 if no max
1045     errorcodeptr points to error code variable
1046    
1047     Returns: pointer to '}' on success;
1048     current ptr on error, with errorcodeptr set non-zero
1049     */
1050    
1051     static const uschar *
1052     read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
1053     {
1054     int min = 0;
1055     int max = -1;
1056    
1057 nigel 81 /* Read the minimum value and do a paranoid check: a negative value indicates
1058     an integer overflow. */
1059    
1060 ph10 391 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - CHAR_0;
1061 nigel 81 if (min < 0 || min > 65535)
1062     {
1063     *errorcodeptr = ERR5;
1064     return p;
1065     }
1066 nigel 77
1067 nigel 81 /* Read the maximum value if there is one, and again do a paranoid on its size.
1068     Also, max must not be less than min. */
1069    
1070 ph10 391 if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
1071 nigel 77 {
1072 ph10 391 if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
1073 nigel 77 {
1074     max = 0;
1075 ph10 391 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - CHAR_0;
1076 nigel 81 if (max < 0 || max > 65535)
1077     {
1078     *errorcodeptr = ERR5;
1079     return p;
1080     }
1081 nigel 77 if (max < min)
1082     {
1083     *errorcodeptr = ERR4;
1084     return p;
1085     }
1086     }
1087     }
1088    
1089 nigel 81 /* Fill in the required variables, and pass back the pointer to the terminating
1090     '}'. */
1091 nigel 77
1092 nigel 81 *minp = min;
1093     *maxp = max;
1094 nigel 77 return p;
1095     }
1096    
1097    
1098    
1099     /*************************************************
1100 ph10 408 * Subroutine for finding forward reference *
1101 nigel 91 *************************************************/
1102    
1103 ph10 408 /* This recursive function is called only from find_parens() below. The
1104     top-level call starts at the beginning of the pattern. All other calls must
1105     start at a parenthesis. It scans along a pattern's text looking for capturing
1106 nigel 93 subpatterns, and counting them. If it finds a named pattern that matches the
1107     name it is given, it returns its number. Alternatively, if the name is NULL, it
1108 ph10 578 returns when it reaches a given numbered subpattern. Recursion is used to keep
1109     track of subpatterns that reset the capturing group numbers - the (?| feature.
1110 nigel 91
1111 ph10 578 This function was originally called only from the second pass, in which we know
1112     that if (?< or (?' or (?P< is encountered, the name will be correctly
1113     terminated because that is checked in the first pass. There is now one call to
1114     this function in the first pass, to check for a recursive back reference by
1115     name (so that we can make the whole group atomic). In this case, we need check
1116 ph10 579 only up to the current position in the pattern, and that is still OK because
1117     and previous occurrences will have been checked. To make this work, the test
1118     for "end of pattern" is a check against cd->end_pattern in the main loop,
1119 ph10 578 instead of looking for a binary zero. This means that the special first-pass
1120 ph10 579 call can adjust cd->end_pattern temporarily. (Checks for binary zero while
1121     processing items within the loop are OK, because afterwards the main loop will
1122 ph10 578 terminate.)
1123    
1124 nigel 91 Arguments:
1125 ph10 408 ptrptr address of the current character pointer (updated)
1126 ph10 345 cd compile background data
1127 nigel 93 name name to seek, or NULL if seeking a numbered subpattern
1128     lorn name length, or subpattern number if name is NULL
1129     xmode TRUE if we are in /x mode
1130 ph10 579 utf8 TRUE if we are in UTF-8 mode
1131 ph10 411 count pointer to the current capturing subpattern number (updated)
1132 nigel 91
1133     Returns: the number of the named subpattern, or -1 if not found
1134     */
1135    
1136     static int
1137 ph10 408 find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn,
1138 ph10 556 BOOL xmode, BOOL utf8, int *count)
1139 nigel 91 {
1140 ph10 408 uschar *ptr = *ptrptr;
1141     int start_count = *count;
1142     int hwm_count = start_count;
1143     BOOL dup_parens = FALSE;
1144 nigel 93
1145 ph10 411 /* If the first character is a parenthesis, check on the type of group we are
1146 ph10 408 dealing with. The very first call may not start with a parenthesis. */
1147    
1148     if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1149     {
1150 ph10 544 /* Handle specials such as (*SKIP) or (*UTF8) etc. */
1151 ph10 545
1152 ph10 544 if (ptr[1] == CHAR_ASTERISK) ptr += 2;
1153 ph10 545
1154 ph10 544 /* Handle a normal, unnamed capturing parenthesis. */
1155 ph10 408
1156 ph10 544 else if (ptr[1] != CHAR_QUESTION_MARK)
1157 ph10 408 {
1158     *count += 1;
1159     if (name == NULL && *count == lorn) return *count;
1160 ph10 411 ptr++;
1161 ph10 408 }
1162    
1163 ph10 544 /* All cases now have (? at the start. Remember when we are in a group
1164     where the parenthesis numbers are duplicated. */
1165    
1166     else if (ptr[2] == CHAR_VERTICAL_LINE)
1167     {
1168     ptr += 3;
1169     dup_parens = TRUE;
1170     }
1171 ph10 545
1172 ph10 544 /* Handle comments; all characters are allowed until a ket is reached. */
1173    
1174     else if (ptr[2] == CHAR_NUMBER_SIGN)
1175     {
1176     for (ptr += 3; *ptr != 0; ptr++) if (*ptr == CHAR_RIGHT_PARENTHESIS) break;
1177     goto FAIL_EXIT;
1178 ph10 545 }
1179 ph10 544
1180 ph10 408 /* Handle a condition. If it is an assertion, just carry on so that it
1181     is processed as normal. If not, skip to the closing parenthesis of the
1182 ph10 544 condition (there can't be any nested parens). */
1183 ph10 411
1184 ph10 408 else if (ptr[2] == CHAR_LEFT_PARENTHESIS)
1185     {
1186 ph10 411 ptr += 2;
1187 ph10 408 if (ptr[1] != CHAR_QUESTION_MARK)
1188     {
1189     while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
1190 ph10 411 if (*ptr != 0) ptr++;
1191 ph10 408 }
1192 ph10 411 }
1193    
1194 ph10 544 /* Start with (? but not a condition. */
1195 ph10 408
1196     else
1197 ph10 411 {
1198 ph10 408 ptr += 2;
1199     if (*ptr == CHAR_P) ptr++; /* Allow optional P */
1200    
1201     /* We have to disambiguate (?<! and (?<= from (?<name> for named groups */
1202 ph10 411
1203 ph10 408 if ((*ptr == CHAR_LESS_THAN_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK &&
1204     ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)
1205     {
1206     int term;
1207     const uschar *thisname;
1208     *count += 1;
1209     if (name == NULL && *count == lorn) return *count;
1210     term = *ptr++;
1211     if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;
1212     thisname = ptr;
1213     while (*ptr != term) ptr++;
1214     if (name != NULL && lorn == ptr - thisname &&
1215     strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1216     return *count;
1217 ph10 461 term++;
1218 ph10 411 }
1219 ph10 408 }
1220 ph10 411 }
1221 ph10 408
1222 ph10 411 /* Past any initial parenthesis handling, scan for parentheses or vertical
1223 ph10 579 bars. Stop if we get to cd->end_pattern. Note that this is important for the
1224     first-pass call when this value is temporarily adjusted to stop at the current
1225 ph10 578 position. So DO NOT change this to a test for binary zero. */
1226 ph10 408
1227 ph10 578 for (; ptr < cd->end_pattern; ptr++)
1228 nigel 91 {
1229 nigel 93 /* Skip over backslashed characters and also entire \Q...\E */
1230    
1231 ph10 391 if (*ptr == CHAR_BACKSLASH)
1232 nigel 93 {
1233 ph10 408 if (*(++ptr) == 0) goto FAIL_EXIT;
1234 ph10 391 if (*ptr == CHAR_Q) for (;;)
1235 nigel 93 {
1236 ph10 391 while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1237 ph10 408 if (*ptr == 0) goto FAIL_EXIT;
1238 ph10 391 if (*(++ptr) == CHAR_E) break;
1239 nigel 93 }
1240     continue;
1241     }
1242    
1243 ph10 340 /* Skip over character classes; this logic must be similar to the way they
1244     are handled for real. If the first character is '^', skip it. Also, if the
1245     first few characters (either before or after ^) are \Q\E or \E we skip them
1246 ph10 392 too. This makes for compatibility with Perl. Note the use of STR macros to
1247 ph10 391 encode "Q\\E" so that it works in UTF-8 on EBCDIC platforms. */
1248 nigel 93
1249 ph10 391 if (*ptr == CHAR_LEFT_SQUARE_BRACKET)
1250 nigel 93 {
1251 ph10 340 BOOL negate_class = FALSE;
1252     for (;;)
1253     {
1254 ph10 438 if (ptr[1] == CHAR_BACKSLASH)
1255 ph10 340 {
1256 ph10 438 if (ptr[2] == CHAR_E)
1257     ptr+= 2;
1258     else if (strncmp((const char *)ptr+2,
1259 ph10 392 STR_Q STR_BACKSLASH STR_E, 3) == 0)
1260 ph10 438 ptr += 4;
1261 ph10 392 else
1262 ph10 391 break;
1263 ph10 340 }
1264 ph10 438 else if (!negate_class && ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1265 ph10 461 {
1266 ph10 340 negate_class = TRUE;
1267 ph10 438 ptr++;
1268 ph10 461 }
1269 ph10 340 else break;
1270     }
1271    
1272     /* If the next character is ']', it is a data character that must be
1273 ph10 341 skipped, except in JavaScript compatibility mode. */
1274 ph10 345
1275 ph10 392 if (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET &&
1276 ph10 391 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
1277 ph10 345 ptr++;
1278    
1279 ph10 391 while (*(++ptr) != CHAR_RIGHT_SQUARE_BRACKET)
1280 nigel 93 {
1281 ph10 220 if (*ptr == 0) return -1;
1282 ph10 391 if (*ptr == CHAR_BACKSLASH)
1283 nigel 93 {
1284 ph10 408 if (*(++ptr) == 0) goto FAIL_EXIT;
1285 ph10 391 if (*ptr == CHAR_Q) for (;;)
1286 nigel 93 {
1287 ph10 391 while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1288 ph10 408 if (*ptr == 0) goto FAIL_EXIT;
1289 ph10 391 if (*(++ptr) == CHAR_E) break;
1290 nigel 93 }
1291     continue;
1292     }
1293     }
1294     continue;
1295     }
1296    
1297     /* Skip comments in /x mode */
1298    
1299 ph10 391 if (xmode && *ptr == CHAR_NUMBER_SIGN)
1300 nigel 93 {
1301 ph10 579 ptr++;
1302 ph10 556 while (*ptr != 0)
1303     {
1304     if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
1305     ptr++;
1306 ph10 579 #ifdef SUPPORT_UTF8
1307 ph10 556 if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
1308     #endif
1309     }
1310 ph10 408 if (*ptr == 0) goto FAIL_EXIT;
1311 nigel 93 continue;
1312     }
1313    
1314 ph10 408 /* Check for the special metacharacters */
1315 ph10 411
1316 ph10 408 if (*ptr == CHAR_LEFT_PARENTHESIS)
1317 nigel 93 {
1318 ph10 556 int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, count);
1319 ph10 408 if (rc > 0) return rc;
1320     if (*ptr == 0) goto FAIL_EXIT;
1321 nigel 93 }
1322 ph10 411
1323 ph10 408 else if (*ptr == CHAR_RIGHT_PARENTHESIS)
1324     {
1325     if (dup_parens && *count < hwm_count) *count = hwm_count;
1326 ph10 545 goto FAIL_EXIT;
1327 ph10 408 }
1328 ph10 411
1329     else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)
1330 ph10 408 {
1331     if (*count > hwm_count) hwm_count = *count;
1332     *count = start_count;
1333 ph10 411 }
1334 ph10 408 }
1335 nigel 93
1336 ph10 408 FAIL_EXIT:
1337     *ptrptr = ptr;
1338     return -1;
1339     }
1340 nigel 93
1341    
1342    
1343    
1344 ph10 408 /*************************************************
1345     * Find forward referenced subpattern *
1346     *************************************************/
1347 nigel 93
1348 ph10 408 /* This function scans along a pattern's text looking for capturing
1349     subpatterns, and counting them. If it finds a named pattern that matches the
1350     name it is given, it returns its number. Alternatively, if the name is NULL, it
1351     returns when it reaches a given numbered subpattern. This is used for forward
1352     references to subpatterns. We used to be able to start this scan from the
1353     current compiling point, using the current count value from cd->bracount, and
1354     do it all in a single loop, but the addition of the possibility of duplicate
1355     subpattern numbers means that we have to scan from the very start, in order to
1356     take account of such duplicates, and to use a recursive function to keep track
1357     of the different types of group.
1358    
1359     Arguments:
1360     cd compile background data
1361     name name to seek, or NULL if seeking a numbered subpattern
1362     lorn name length, or subpattern number if name is NULL
1363     xmode TRUE if we are in /x mode
1364 ph10 579 utf8 TRUE if we are in UTF-8 mode
1365 ph10 408
1366     Returns: the number of the found subpattern, or -1 if not found
1367     */
1368    
1369     static int
1370 ph10 556 find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode,
1371     BOOL utf8)
1372 ph10 408 {
1373     uschar *ptr = (uschar *)cd->start_pattern;
1374     int count = 0;
1375     int rc;
1376    
1377     /* If the pattern does not start with an opening parenthesis, the first call
1378     to find_parens_sub() will scan right to the end (if necessary). However, if it
1379     does start with a parenthesis, find_parens_sub() will return when it hits the
1380     matching closing parens. That is why we have to have a loop. */
1381    
1382 ph10 411 for (;;)
1383     {
1384 ph10 556 rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, &count);
1385 ph10 411 if (rc > 0 || *ptr++ == 0) break;
1386     }
1387    
1388 ph10 408 return rc;
1389 nigel 91 }
1390    
1391    
1392    
1393 ph10 408
1394 nigel 91 /*************************************************
1395 nigel 77 * Find first significant op code *
1396     *************************************************/
1397    
1398     /* This is called by several functions that scan a compiled expression looking
1399     for a fixed first character, or an anchoring op code etc. It skips over things
1400 ph10 602 that do not influence this. For some calls, it makes sense to skip negative
1401     forward and all backward assertions, and also the \b assertion; for others it
1402     does not.
1403 nigel 77
1404     Arguments:
1405     code pointer to the start of the group
1406     skipassert TRUE if certain assertions are to be skipped
1407    
1408     Returns: pointer to the first significant opcode
1409     */
1410    
1411     static const uschar*
1412 ph10 604 first_significant_code(const uschar *code, BOOL skipassert)
1413 nigel 77 {
1414     for (;;)
1415     {
1416     switch ((int)*code)
1417     {
1418     case OP_ASSERT_NOT:
1419     case OP_ASSERTBACK:
1420     case OP_ASSERTBACK_NOT:
1421     if (!skipassert) return code;
1422     do code += GET(code, 1); while (*code == OP_ALT);
1423     code += _pcre_OP_lengths[*code];
1424     break;
1425    
1426     case OP_WORD_BOUNDARY:
1427     case OP_NOT_WORD_BOUNDARY:
1428     if (!skipassert) return code;
1429     /* Fall through */
1430    
1431     case OP_CALLOUT:
1432     case OP_CREF:
1433 ph10 459 case OP_NCREF:
1434 nigel 93 case OP_RREF:
1435 ph10 459 case OP_NRREF:
1436 nigel 93 case OP_DEF:
1437 nigel 77 code += _pcre_OP_lengths[*code];
1438     break;
1439    
1440     default:
1441     return code;
1442     }
1443     }
1444     /* Control never reaches here */
1445     }
1446    
1447    
1448    
1449    
1450     /*************************************************
1451 ph10 454 * Find the fixed length of a branch *
1452 nigel 77 *************************************************/
1453    
1454 ph10 454 /* Scan a branch and compute the fixed length of subject that will match it,
1455 nigel 77 if the length is fixed. This is needed for dealing with backward assertions.
1456 ph10 461 In UTF8 mode, the result is in characters rather than bytes. The branch is
1457 ph10 454 temporarily terminated with OP_END when this function is called.
1458 nigel 77
1459 ph10 461 This function is called when a backward assertion is encountered, so that if it
1460     fails, the error message can point to the correct place in the pattern.
1461 ph10 454 However, we cannot do this when the assertion contains subroutine calls,
1462 ph10 461 because they can be forward references. We solve this by remembering this case
1463 ph10 454 and doing the check at the end; a flag specifies which mode we are running in.
1464    
1465 nigel 77 Arguments:
1466     code points to the start of the pattern (the bracket)
1467 ph10 604 utf8 TRUE in UTF-8 mode
1468 ph10 461 atend TRUE if called when the pattern is complete
1469     cd the "compile data" structure
1470 nigel 77
1471 ph10 461 Returns: the fixed length,
1472 ph10 454 or -1 if there is no fixed length,
1473 nigel 77 or -2 if \C was encountered
1474 ph10 454 or -3 if an OP_RECURSE item was encountered and atend is FALSE
1475 nigel 77 */
1476    
1477     static int
1478 ph10 604 find_fixedlength(uschar *code, BOOL utf8, BOOL atend, compile_data *cd)
1479 nigel 77 {
1480     int length = -1;
1481    
1482     register int branchlength = 0;
1483     register uschar *cc = code + 1 + LINK_SIZE;
1484    
1485     /* Scan along the opcodes for this branch. If we get to the end of the
1486     branch, check the length against that of the other branches. */
1487    
1488     for (;;)
1489     {
1490     int d;
1491 ph10 454 uschar *ce, *cs;
1492 nigel 77 register int op = *cc;
1493     switch (op)
1494     {
1495 ph10 604 /* We only need to continue for OP_CBRA (normal capturing bracket) and
1496     OP_BRA (normal non-capturing bracket) because the other variants of these
1497     opcodes are all concerned with unlimited repeated groups, which of course
1498     are not of fixed length. They will cause a -1 response from the default
1499     case of this switch. */
1500    
1501 nigel 93 case OP_CBRA:
1502 nigel 77 case OP_BRA:
1503     case OP_ONCE:
1504     case OP_COND:
1505 ph10 604 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), utf8, atend, cd);
1506 nigel 77 if (d < 0) return d;
1507     branchlength += d;
1508     do cc += GET(cc, 1); while (*cc == OP_ALT);
1509     cc += 1 + LINK_SIZE;
1510     break;
1511    
1512     /* Reached end of a branch; if it's a ket it is the end of a nested
1513     call. If it's ALT it is an alternation in a nested call. If it is
1514 ph10 604 END it's the end of the outer call. All can be handled by the same code.
1515     Note that we must not include the OP_KETRxxx opcodes here, because they
1516     all imply an unlimited repeat. */
1517 nigel 77
1518     case OP_ALT:
1519     case OP_KET:
1520     case OP_END:
1521     if (length < 0) length = branchlength;
1522     else if (length != branchlength) return -1;
1523     if (*cc != OP_ALT) return length;
1524     cc += 1 + LINK_SIZE;
1525     branchlength = 0;
1526     break;
1527 ph10 461
1528 ph10 454 /* A true recursion implies not fixed length, but a subroutine call may
1529     be OK. If the subroutine is a forward reference, we can't deal with
1530     it until the end of the pattern, so return -3. */
1531 ph10 461
1532 ph10 454 case OP_RECURSE:
1533     if (!atend) return -3;
1534     cs = ce = (uschar *)cd->start_code + GET(cc, 1); /* Start subpattern */
1535     do ce += GET(ce, 1); while (*ce == OP_ALT); /* End subpattern */
1536     if (cc > cs && cc < ce) return -1; /* Recursion */
1537 ph10 604 d = find_fixedlength(cs + 2, utf8, atend, cd);
1538 ph10 461 if (d < 0) return d;
1539 ph10 454 branchlength += d;
1540     cc += 1 + LINK_SIZE;
1541 ph10 461 break;
1542 nigel 77
1543     /* Skip over assertive subpatterns */
1544    
1545     case OP_ASSERT:
1546     case OP_ASSERT_NOT:
1547     case OP_ASSERTBACK:
1548     case OP_ASSERTBACK_NOT:
1549     do cc += GET(cc, 1); while (*cc == OP_ALT);
1550     /* Fall through */
1551    
1552     /* Skip over things that don't match chars */
1553    
1554     case OP_REVERSE:
1555     case OP_CREF:
1556 ph10 459 case OP_NCREF:
1557 nigel 93 case OP_RREF:
1558 ph10 459 case OP_NRREF:
1559 nigel 93 case OP_DEF:
1560 nigel 77 case OP_CALLOUT:
1561     case OP_SOD:
1562     case OP_SOM:
1563 ph10 500 case OP_SET_SOM:
1564 nigel 77 case OP_EOD:
1565     case OP_EODN:
1566     case OP_CIRC:
1567 ph10 602 case OP_CIRCM:
1568 nigel 77 case OP_DOLL:
1569 ph10 602 case OP_DOLLM:
1570 nigel 77 case OP_NOT_WORD_BOUNDARY:
1571     case OP_WORD_BOUNDARY:
1572     cc += _pcre_OP_lengths[*cc];
1573     break;
1574    
1575     /* Handle literal characters */
1576    
1577     case OP_CHAR:
1578 ph10 602 case OP_CHARI:
1579 nigel 91 case OP_NOT:
1580 ph10 604 case OP_NOTI:
1581 nigel 77 branchlength++;
1582     cc += 2;
1583     #ifdef SUPPORT_UTF8
1584 ph10 604 if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
1585 nigel 77 #endif
1586     break;
1587    
1588     /* Handle exact repetitions. The count is already in characters, but we
1589     need to skip over a multibyte character in UTF8 mode. */
1590    
1591     case OP_EXACT:
1592     branchlength += GET2(cc,1);
1593     cc += 4;
1594     #ifdef SUPPORT_UTF8
1595 ph10 604 if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
1596 nigel 77 #endif
1597     break;
1598    
1599     case OP_TYPEEXACT:
1600     branchlength += GET2(cc,1);
1601 ph10 220 if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1602 nigel 77 cc += 4;
1603     break;
1604    
1605     /* Handle single-char matchers */
1606    
1607     case OP_PROP:
1608     case OP_NOTPROP:
1609 nigel 87 cc += 2;
1610 nigel 77 /* Fall through */
1611    
1612     case OP_NOT_DIGIT:
1613     case OP_DIGIT:
1614     case OP_NOT_WHITESPACE:
1615     case OP_WHITESPACE:
1616     case OP_NOT_WORDCHAR:
1617     case OP_WORDCHAR:
1618     case OP_ANY:
1619 ph10 342 case OP_ALLANY:
1620 nigel 77 branchlength++;
1621     cc++;
1622     break;
1623    
1624     /* The single-byte matcher isn't allowed */
1625    
1626     case OP_ANYBYTE:
1627     return -2;
1628    
1629     /* Check a class for variable quantification */
1630    
1631     #ifdef SUPPORT_UTF8
1632     case OP_XCLASS:
1633     cc += GET(cc, 1) - 33;
1634     /* Fall through */
1635     #endif
1636    
1637     case OP_CLASS:
1638     case OP_NCLASS:
1639     cc += 33;
1640    
1641     switch (*cc)
1642     {
1643     case OP_CRSTAR:
1644     case OP_CRMINSTAR:
1645     case OP_CRQUERY:
1646     case OP_CRMINQUERY:
1647     return -1;
1648    
1649     case OP_CRRANGE:
1650     case OP_CRMINRANGE:
1651     if (GET2(cc,1) != GET2(cc,3)) return -1;
1652     branchlength += GET2(cc,1);
1653     cc += 5;
1654     break;
1655    
1656     default:
1657     branchlength++;
1658     }
1659     break;
1660    
1661     /* Anything else is variable length */
1662    
1663     default:
1664     return -1;
1665     }
1666     }
1667     /* Control never gets here */
1668     }
1669    
1670    
1671    
1672    
1673     /*************************************************
1674 ph10 454 * Scan compiled regex for specific bracket *
1675 nigel 77 *************************************************/
1676    
1677     /* This little function scans through a compiled pattern until it finds a
1678 ph10 454 capturing bracket with the given number, or, if the number is negative, an
1679 ph10 461 instance of OP_REVERSE for a lookbehind. The function is global in the C sense
1680     so that it can be called from pcre_study() when finding the minimum matching
1681 ph10 455 length.
1682 nigel 77
1683     Arguments:
1684     code points to start of expression
1685     utf8 TRUE in UTF-8 mode
1686 ph10 454 number the required bracket number or negative to find a lookbehind
1687 nigel 77
1688     Returns: pointer to the opcode for the bracket, or NULL if not found
1689     */
1690    
1691 ph10 455 const uschar *
1692     _pcre_find_bracket(const uschar *code, BOOL utf8, int number)
1693 nigel 77 {
1694     for (;;)
1695     {
1696     register int c = *code;
1697     if (c == OP_END) return NULL;
1698 nigel 91
1699     /* XCLASS is used for classes that cannot be represented just by a bit
1700     map. This includes negated single high-valued characters. The length in
1701     the table is zero; the actual length is stored in the compiled code. */
1702    
1703     if (c == OP_XCLASS) code += GET(code, 1);
1704 ph10 461
1705 ph10 454 /* Handle recursion */
1706 ph10 461
1707 ph10 454 else if (c == OP_REVERSE)
1708     {
1709 ph10 461 if (number < 0) return (uschar *)code;
1710 ph10 454 code += _pcre_OP_lengths[c];
1711     }
1712 nigel 91
1713 nigel 93 /* Handle capturing bracket */
1714 nigel 91
1715 ph10 604 else if (c == OP_CBRA || c == OP_SCBRA ||
1716     c == OP_CBRAPOS || c == OP_SCBRAPOS)
1717 nigel 77 {
1718 nigel 93 int n = GET2(code, 1+LINK_SIZE);
1719 nigel 77 if (n == number) return (uschar *)code;
1720 nigel 93 code += _pcre_OP_lengths[c];
1721 nigel 77 }
1722 nigel 91
1723 ph10 220 /* Otherwise, we can get the item's length from the table, except that for
1724     repeated character types, we have to test for \p and \P, which have an extra
1725 ph10 512 two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
1726 ph10 510 must add in its length. */
1727 nigel 91
1728 nigel 77 else
1729     {
1730 ph10 218 switch(c)
1731     {
1732     case OP_TYPESTAR:
1733     case OP_TYPEMINSTAR:
1734     case OP_TYPEPLUS:
1735     case OP_TYPEMINPLUS:
1736     case OP_TYPEQUERY:
1737     case OP_TYPEMINQUERY:
1738     case OP_TYPEPOSSTAR:
1739     case OP_TYPEPOSPLUS:
1740     case OP_TYPEPOSQUERY:
1741     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1742 ph10 220 break;
1743 ph10 221
1744     case OP_TYPEUPTO:
1745     case OP_TYPEMINUPTO:
1746     case OP_TYPEEXACT:
1747     case OP_TYPEPOSUPTO:
1748     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1749     break;
1750 ph10 512
1751 ph10 510 case OP_MARK:
1752     case OP_PRUNE_ARG:
1753     case OP_SKIP_ARG:
1754     code += code[1];
1755 ph10 512 break;
1756 ph10 550
1757     case OP_THEN_ARG:
1758     code += code[1+LINK_SIZE];
1759     break;
1760 ph10 220 }
1761    
1762 ph10 218 /* Add in the fixed length from the table */
1763 ph10 220
1764 nigel 77 code += _pcre_OP_lengths[c];
1765 ph10 220
1766 ph10 218 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1767     a multi-byte character. The length in the table is a minimum, so we have to
1768     arrange to skip the extra bytes. */
1769 ph10 220
1770 ph10 107 #ifdef SUPPORT_UTF8
1771 nigel 77 if (utf8) switch(c)
1772     {
1773     case OP_CHAR:
1774 ph10 602 case OP_CHARI:
1775 nigel 77 case OP_EXACT:
1776 ph10 602 case OP_EXACTI:
1777 nigel 77 case OP_UPTO:
1778 ph10 602 case OP_UPTOI:
1779 nigel 77 case OP_MINUPTO:
1780 ph10 602 case OP_MINUPTOI:
1781 nigel 93 case OP_POSUPTO:
1782 ph10 602 case OP_POSUPTOI:
1783 nigel 77 case OP_STAR:
1784 ph10 602 case OP_STARI:
1785 nigel 77 case OP_MINSTAR:
1786 ph10 602 case OP_MINSTARI:
1787 nigel 93 case OP_POSSTAR:
1788 ph10 602 case OP_POSSTARI:
1789 nigel 77 case OP_PLUS:
1790 ph10 602 case OP_PLUSI:
1791 nigel 77 case OP_MINPLUS:
1792 ph10 602 case OP_MINPLUSI:
1793 nigel 93 case OP_POSPLUS:
1794 ph10 602 case OP_POSPLUSI:
1795 nigel 77 case OP_QUERY:
1796 ph10 602 case OP_QUERYI:
1797 nigel 77 case OP_MINQUERY:
1798 ph10 602 case OP_MINQUERYI:
1799 nigel 93 case OP_POSQUERY:
1800 ph10 602 case OP_POSQUERYI:
1801 nigel 93 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1802 nigel 77 break;
1803     }
1804 ph10 369 #else
1805     (void)(utf8); /* Keep compiler happy by referencing function argument */
1806 ph10 111 #endif
1807 nigel 77 }
1808     }
1809     }
1810    
1811    
1812    
1813     /*************************************************
1814     * Scan compiled regex for recursion reference *
1815     *************************************************/
1816    
1817     /* This little function scans through a compiled pattern until it finds an
1818     instance of OP_RECURSE.
1819    
1820     Arguments:
1821     code points to start of expression
1822     utf8 TRUE in UTF-8 mode
1823    
1824     Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1825     */
1826    
1827     static const uschar *
1828     find_recurse(const uschar *code, BOOL utf8)
1829     {
1830     for (;;)
1831     {
1832     register int c = *code;
1833     if (c == OP_END) return NULL;
1834 nigel 91 if (c == OP_RECURSE) return code;
1835 ph10 220
1836 nigel 91 /* XCLASS is used for classes that cannot be represented just by a bit
1837     map. This includes negated single high-valued characters. The length in
1838     the table is zero; the actual length is stored in the compiled code. */
1839    
1840     if (c == OP_XCLASS) code += GET(code, 1);
1841    
1842 ph10 220 /* Otherwise, we can get the item's length from the table, except that for
1843     repeated character types, we have to test for \p and \P, which have an extra
1844 ph10 512 two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
1845 ph10 510 must add in its length. */
1846 nigel 91
1847 nigel 77 else
1848     {
1849 ph10 218 switch(c)
1850     {
1851     case OP_TYPESTAR:
1852     case OP_TYPEMINSTAR:
1853     case OP_TYPEPLUS:
1854     case OP_TYPEMINPLUS:
1855     case OP_TYPEQUERY:
1856     case OP_TYPEMINQUERY:
1857     case OP_TYPEPOSSTAR:
1858     case OP_TYPEPOSPLUS:
1859     case OP_TYPEPOSQUERY:
1860     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1861 ph10 220 break;
1862 ph10 221
1863     case OP_TYPEPOSUPTO:
1864     case OP_TYPEUPTO:
1865     case OP_TYPEMINUPTO:
1866     case OP_TYPEEXACT:
1867     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1868     break;
1869 ph10 512
1870 ph10 510 case OP_MARK:
1871     case OP_PRUNE_ARG:
1872     case OP_SKIP_ARG:
1873     code += code[1];
1874 ph10 512 break;
1875 ph10 550
1876     case OP_THEN_ARG:
1877     code += code[1+LINK_SIZE];
1878     break;
1879 ph10 220 }
1880    
1881 ph10 218 /* Add in the fixed length from the table */
1882    
1883 nigel 77 code += _pcre_OP_lengths[c];
1884 ph10 220
1885 ph10 218 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1886     by a multi-byte character. The length in the table is a minimum, so we have
1887     to arrange to skip the extra bytes. */
1888 ph10 220
1889 ph10 107 #ifdef SUPPORT_UTF8
1890 nigel 77 if (utf8) switch(c)
1891     {
1892     case OP_CHAR:
1893 ph10 602 case OP_CHARI:
1894 nigel 77 case OP_EXACT:
1895 ph10 602 case OP_EXACTI:
1896 nigel 77 case OP_UPTO:
1897 ph10 602 case OP_UPTOI:
1898 nigel 77 case OP_MINUPTO:
1899 ph10 602 case OP_MINUPTOI:
1900 nigel 93 case OP_POSUPTO:
1901 ph10 602 case OP_POSUPTOI:
1902 nigel 77 case OP_STAR:
1903 ph10 602 case OP_STARI:
1904 nigel 77 case OP_MINSTAR:
1905 ph10 602 case OP_MINSTARI:
1906 nigel 93 case OP_POSSTAR:
1907 ph10 602 case OP_POSSTARI:
1908 nigel 77 case OP_PLUS:
1909 ph10 602 case OP_PLUSI:
1910 nigel 77 case OP_MINPLUS:
1911 ph10 602 case OP_MINPLUSI:
1912 nigel 93 case OP_POSPLUS:
1913 ph10 602 case OP_POSPLUSI:
1914 nigel 77 case OP_QUERY:
1915 ph10 602 case OP_QUERYI:
1916 nigel 77 case OP_MINQUERY:
1917 ph10 602 case OP_MINQUERYI:
1918 nigel 93 case OP_POSQUERY:
1919 ph10 602 case OP_POSQUERYI:
1920 nigel 93 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1921 nigel 77 break;
1922     }
1923 ph10 369 #else
1924     (void)(utf8); /* Keep compiler happy by referencing function argument */
1925 ph10 111 #endif
1926 nigel 77 }
1927     }
1928     }
1929    
1930    
1931    
1932     /*************************************************
1933     * Scan compiled branch for non-emptiness *
1934     *************************************************/
1935    
1936     /* This function scans through a branch of a compiled pattern to see whether it
1937 nigel 93 can match the empty string or not. It is called from could_be_empty()
1938     below and from compile_branch() when checking for an unlimited repeat of a
1939     group that can match nothing. Note that first_significant_code() skips over
1940 ph10 282 backward and negative forward assertions when its final argument is TRUE. If we
1941     hit an unclosed bracket, we return "empty" - this means we've struck an inner
1942     bracket whose current branch will already have been scanned.
1943 nigel 77
1944     Arguments:
1945     code points to start of search
1946     endcode points to where to stop
1947     utf8 TRUE if in UTF8 mode
1948 ph10 503 cd contains pointers to tables etc.
1949 nigel 77
1950     Returns: TRUE if what is matched could be empty
1951     */
1952    
1953     static BOOL
1954 ph10 503 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8,
1955     compile_data *cd)
1956 nigel 77 {
1957     register int c;
1958 ph10 604 for (code = first_significant_code(code + _pcre_OP_lengths[*code], TRUE);
1959 nigel 77 code < endcode;
1960 ph10 604 code = first_significant_code(code + _pcre_OP_lengths[c], TRUE))
1961 nigel 77 {
1962     const uschar *ccode;
1963    
1964     c = *code;
1965 ph10 507
1966 ph10 286 /* Skip over forward assertions; the other assertions are skipped by
1967 ph10 282 first_significant_code() with a TRUE final argument. */
1968 ph10 286
1969 ph10 282 if (c == OP_ASSERT)
1970 ph10 286 {
1971 ph10 282 do code += GET(code, 1); while (*code == OP_ALT);
1972     c = *code;
1973     continue;
1974 ph10 286 }
1975 ph10 172
1976 ph10 503 /* For a recursion/subroutine call, if its end has been reached, which
1977     implies a subroutine call, we can scan it. */
1978 ph10 507
1979 ph10 503 if (c == OP_RECURSE)
1980     {
1981 ph10 507 BOOL empty_branch = FALSE;
1982 ph10 503 const uschar *scode = cd->start_code + GET(code, 1);
1983     if (GET(scode, 1) == 0) return TRUE; /* Unclosed */
1984     do
1985     {
1986 ph10 504 if (could_be_empty_branch(scode, endcode, utf8, cd))
1987     {
1988     empty_branch = TRUE;
1989 ph10 507 break;
1990     }
1991 ph10 503 scode += GET(scode, 1);
1992     }
1993     while (*scode == OP_ALT);
1994 ph10 504 if (!empty_branch) return FALSE; /* All branches are non-empty */
1995 ph10 503 continue;
1996 ph10 507 }
1997 ph10 170
1998 ph10 604 /* Groups with zero repeats can of course be empty; skip them. */
1999    
2000     if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO ||
2001     c == OP_BRAPOSZERO)
2002     {
2003     code += _pcre_OP_lengths[c];
2004     do code += GET(code, 1); while (*code == OP_ALT);
2005     c = *code;
2006     continue;
2007     }
2008    
2009     /* A nested group that is already marked as "could be empty" can just be
2010     skipped. */
2011    
2012     if (c == OP_SBRA || c == OP_SBRAPOS ||
2013     c == OP_SCBRA || c == OP_SCBRAPOS)
2014     {
2015     do code += GET(code, 1); while (*code == OP_ALT);
2016     c = *code;
2017     continue;
2018     }
2019    
2020 ph10 170 /* For other groups, scan the branches. */
2021 ph10 172
2022 ph10 604 if (c == OP_BRA || c == OP_BRAPOS ||
2023     c == OP_CBRA || c == OP_CBRAPOS ||
2024     c == OP_ONCE || c == OP_COND)
2025 nigel 77 {
2026     BOOL empty_branch;
2027     if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
2028 ph10 406
2029     /* If a conditional group has only one branch, there is a second, implied,
2030 ph10 395 empty branch, so just skip over the conditional, because it could be empty.
2031     Otherwise, scan the individual branches of the group. */
2032 ph10 406
2033 ph10 395 if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
2034 nigel 77 code += GET(code, 1);
2035 ph10 395 else
2036 ph10 406 {
2037 ph10 395 empty_branch = FALSE;
2038     do
2039     {
2040 ph10 503 if (!empty_branch && could_be_empty_branch(code, endcode, utf8, cd))
2041 ph10 395 empty_branch = TRUE;
2042     code += GET(code, 1);
2043     }
2044     while (*code == OP_ALT);
2045     if (!empty_branch) return FALSE; /* All branches are non-empty */
2046 nigel 77 }
2047 ph10 406
2048 ph10 172 c = *code;
2049 nigel 93 continue;
2050 nigel 77 }
2051    
2052 nigel 93 /* Handle the other opcodes */
2053    
2054     switch (c)
2055 nigel 77 {
2056 ph10 216 /* Check for quantifiers after a class. XCLASS is used for classes that
2057     cannot be represented just by a bit map. This includes negated single
2058     high-valued characters. The length in _pcre_OP_lengths[] is zero; the
2059 ph10 220 actual length is stored in the compiled code, so we must update "code"
2060 ph10 216 here. */
2061 nigel 77
2062     #ifdef SUPPORT_UTF8
2063     case OP_XCLASS:
2064 ph10 216 ccode = code += GET(code, 1);
2065 nigel 77 goto CHECK_CLASS_REPEAT;
2066     #endif
2067    
2068     case OP_CLASS:
2069     case OP_NCLASS:
2070     ccode = code + 33;
2071    
2072     #ifdef SUPPORT_UTF8
2073     CHECK_CLASS_REPEAT:
2074     #endif
2075    
2076     switch (*ccode)
2077     {
2078     case OP_CRSTAR: /* These could be empty; continue */
2079     case OP_CRMINSTAR:
2080     case OP_CRQUERY:
2081     case OP_CRMINQUERY:
2082     break;
2083    
2084     default: /* Non-repeat => class must match */
2085     case OP_CRPLUS: /* These repeats aren't empty */
2086     case OP_CRMINPLUS:
2087     return FALSE;
2088    
2089     case OP_CRRANGE:
2090     case OP_CRMINRANGE:
2091     if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
2092     break;
2093     }
2094     break;
2095    
2096     /* Opcodes that must match a character */
2097    
2098     case OP_PROP:
2099     case OP_NOTPROP:
2100     case OP_EXTUNI:
2101     case OP_NOT_DIGIT:
2102     case OP_DIGIT:
2103     case OP_NOT_WHITESPACE:
2104     case OP_WHITESPACE:
2105     case OP_NOT_WORDCHAR:
2106     case OP_WORDCHAR:
2107     case OP_ANY:
2108 ph10 345 case OP_ALLANY:
2109 nigel 77 case OP_ANYBYTE:
2110     case OP_CHAR:
2111 ph10 602 case OP_CHARI:
2112 nigel 77 case OP_NOT:
2113 ph10 602 case OP_NOTI:
2114 nigel 77 case OP_PLUS:
2115     case OP_MINPLUS:
2116 nigel 93 case OP_POSPLUS:
2117 nigel 77 case OP_EXACT:
2118     case OP_NOTPLUS:
2119     case OP_NOTMINPLUS:
2120 nigel 93 case OP_NOTPOSPLUS:
2121 nigel 77 case OP_NOTEXACT:
2122     case OP_TYPEPLUS:
2123     case OP_TYPEMINPLUS:
2124 nigel 93 case OP_TYPEPOSPLUS:
2125 nigel 77 case OP_TYPEEXACT:
2126     return FALSE;
2127 ph10 227
2128     /* These are going to continue, as they may be empty, but we have to
2129     fudge the length for the \p and \P cases. */
2130    
2131 ph10 224 case OP_TYPESTAR:
2132     case OP_TYPEMINSTAR:
2133     case OP_TYPEPOSSTAR:
2134     case OP_TYPEQUERY:
2135     case OP_TYPEMINQUERY:
2136     case OP_TYPEPOSQUERY:
2137     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2138 ph10 227 break;
2139    
2140 ph10 224 /* Same for these */
2141 ph10 227
2142 ph10 224 case OP_TYPEUPTO:
2143     case OP_TYPEMINUPTO:
2144     case OP_TYPEPOSUPTO:
2145     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
2146     break;
2147 nigel 77
2148     /* End of branch */
2149    
2150     case OP_KET:
2151     case OP_KETRMAX:
2152     case OP_KETRMIN:
2153 ph10 604 case OP_KETRPOS:
2154 nigel 77 case OP_ALT:
2155     return TRUE;
2156    
2157 nigel 93 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
2158     MINUPTO, and POSUPTO may be followed by a multibyte character */
2159 nigel 77
2160     #ifdef SUPPORT_UTF8
2161     case OP_STAR:
2162 ph10 602 case OP_STARI:
2163 nigel 77 case OP_MINSTAR:
2164 ph10 602 case OP_MINSTARI:
2165 nigel 93 case OP_POSSTAR:
2166 ph10 602 case OP_POSSTARI:
2167 nigel 77 case OP_QUERY:
2168 ph10 602 case OP_QUERYI:
2169 nigel 77 case OP_MINQUERY:
2170 ph10 602 case OP_MINQUERYI:
2171 nigel 93 case OP_POSQUERY:
2172 ph10 602 case OP_POSQUERYI:
2173 ph10 426 if (utf8 && code[1] >= 0xc0) code += _pcre_utf8_table4[code[1] & 0x3f];
2174     break;
2175 ph10 461
2176 nigel 77 case OP_UPTO:
2177 ph10 602 case OP_UPTOI:
2178 nigel 77 case OP_MINUPTO:
2179 ph10 602 case OP_MINUPTOI:
2180 nigel 93 case OP_POSUPTO:
2181 ph10 602 case OP_POSUPTOI:
2182 ph10 426 if (utf8 && code[3] >= 0xc0) code += _pcre_utf8_table4[code[3] & 0x3f];
2183 nigel 77 break;
2184     #endif
2185 ph10 503
2186 ph10 510 /* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument
2187     string. */
2188    
2189     case OP_MARK:
2190     case OP_PRUNE_ARG:
2191     case OP_SKIP_ARG:
2192     code += code[1];
2193 ph10 512 break;
2194 ph10 510
2195 ph10 550 case OP_THEN_ARG:
2196     code += code[1+LINK_SIZE];
2197     break;
2198    
2199 ph10 503 /* None of the remaining opcodes are required to match a character. */
2200 ph10 507
2201 ph10 503 default:
2202 ph10 507 break;
2203 nigel 77 }
2204     }
2205    
2206     return TRUE;
2207     }
2208    
2209    
2210    
2211     /*************************************************
2212     * Scan compiled regex for non-emptiness *
2213     *************************************************/
2214    
2215     /* This function is called to check for left recursive calls. We want to check
2216     the current branch of the current pattern to see if it could match the empty
2217     string. If it could, we must look outwards for branches at other levels,
2218     stopping when we pass beyond the bracket which is the subject of the recursion.
2219    
2220     Arguments:
2221     code points to start of the recursion
2222     endcode points to where to stop (current RECURSE item)
2223     bcptr points to the chain of current (unclosed) branch starts
2224     utf8 TRUE if in UTF-8 mode
2225 ph10 507 cd pointers to tables etc
2226 nigel 77
2227     Returns: TRUE if what is matched could be empty
2228     */
2229    
2230     static BOOL
2231     could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
2232 ph10 503 BOOL utf8, compile_data *cd)
2233 nigel 77 {
2234 ph10 475 while (bcptr != NULL && bcptr->current_branch >= code)
2235 nigel 77 {
2236 ph10 503 if (!could_be_empty_branch(bcptr->current_branch, endcode, utf8, cd))
2237 ph10 475 return FALSE;
2238 nigel 77 bcptr = bcptr->outer;
2239     }
2240     return TRUE;
2241     }
2242    
2243    
2244    
2245     /*************************************************
2246     * Check for POSIX class syntax *
2247     *************************************************/
2248    
2249     /* This function is called when the sequence "[:" or "[." or "[=" is
2250 ph10 295 encountered in a character class. It checks whether this is followed by a
2251 ph10 298 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
2252 ph10 295 reach an unescaped ']' without the special preceding character, return FALSE.
2253 nigel 77
2254 ph10 298 Originally, this function only recognized a sequence of letters between the
2255     terminators, but it seems that Perl recognizes any sequence of characters,
2256     though of course unknown POSIX names are subsequently rejected. Perl gives an
2257     "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
2258     didn't consider this to be a POSIX class. Likewise for [:1234:].
2259 ph10 295
2260 ph10 298 The problem in trying to be exactly like Perl is in the handling of escapes. We
2261     have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
2262     class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
2263     below handles the special case of \], but does not try to do any other escape
2264     processing. This makes it different from Perl for cases such as [:l\ower:]
2265 ph10 295 where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
2266 ph10 298 "l\ower". This is a lesser evil that not diagnosing bad classes when Perl does,
2267 ph10 295 I think.
2268    
2269     Arguments:
2270 nigel 77 ptr pointer to the initial [
2271     endptr where to return the end pointer
2272    
2273     Returns: TRUE or FALSE
2274     */
2275    
2276     static BOOL
2277 ph10 295 check_posix_syntax(const uschar *ptr, const uschar **endptr)
2278 nigel 77 {
2279     int terminator; /* Don't combine these lines; the Solaris cc */
2280     terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
2281 ph10 295 for (++ptr; *ptr != 0; ptr++)
2282 nigel 77 {
2283 ph10 391 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) ptr++; else
2284 ph10 298 {
2285 ph10 391 if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2286     if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2287 ph10 295 {
2288     *endptr = ptr;
2289     return TRUE;
2290 ph10 298 }
2291     }
2292     }
2293 nigel 77 return FALSE;
2294     }
2295    
2296    
2297    
2298    
2299     /*************************************************
2300     * Check POSIX class name *
2301     *************************************************/
2302    
2303     /* This function is called to check the name given in a POSIX-style class entry
2304     such as [:alnum:].
2305    
2306     Arguments:
2307     ptr points to the first letter
2308     len the length of the name
2309    
2310     Returns: a value representing the name, or -1 if unknown
2311     */
2312    
2313     static int
2314     check_posix_name(const uschar *ptr, int len)
2315     {
2316 ph10 240 const char *pn = posix_names;
2317 nigel 77 register int yield = 0;
2318     while (posix_name_lengths[yield] != 0)
2319     {
2320     if (len == posix_name_lengths[yield] &&
2321 ph10 240 strncmp((const char *)ptr, pn, len) == 0) return yield;
2322 ph10 243 pn += posix_name_lengths[yield] + 1;
2323 nigel 77 yield++;
2324     }
2325     return -1;
2326     }
2327    
2328    
2329     /*************************************************
2330     * Adjust OP_RECURSE items in repeated group *
2331     *************************************************/
2332    
2333     /* OP_RECURSE items contain an offset from the start of the regex to the group
2334     that is referenced. This means that groups can be replicated for fixed
2335     repetition simply by copying (because the recursion is allowed to refer to
2336     earlier groups that are outside the current group). However, when a group is
2337 ph10 335 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
2338     inserted before it, after it has been compiled. This means that any OP_RECURSE
2339     items within it that refer to the group itself or any contained groups have to
2340     have their offsets adjusted. That one of the jobs of this function. Before it
2341     is called, the partially compiled regex must be temporarily terminated with
2342     OP_END.
2343 nigel 77
2344 nigel 93 This function has been extended with the possibility of forward references for
2345     recursions and subroutine calls. It must also check the list of such references
2346     for the group we are dealing with. If it finds that one of the recursions in
2347     the current group is on this list, it adjusts the offset in the list, not the
2348     value in the reference (which is a group number).
2349    
2350 nigel 77 Arguments:
2351     group points to the start of the group
2352     adjust the amount by which the group is to be moved
2353     utf8 TRUE in UTF-8 mode
2354     cd contains pointers to tables etc.
2355 nigel 93 save_hwm the hwm forward reference pointer at the start of the group
2356 nigel 77
2357     Returns: nothing
2358     */
2359    
2360     static void
2361 nigel 93 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
2362     uschar *save_hwm)
2363 nigel 77 {
2364     uschar *ptr = group;
2365 ph10 224
2366 nigel 77 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
2367     {
2368 nigel 93 int offset;
2369     uschar *hc;
2370    
2371     /* See if this recursion is on the forward reference list. If so, adjust the
2372     reference. */
2373 ph10 345
2374 nigel 93 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
2375     {
2376     offset = GET(hc, 0);
2377     if (cd->start_code + offset == ptr + 1)
2378     {
2379     PUT(hc, 0, offset + adjust);
2380     break;
2381     }
2382     }
2383    
2384     /* Otherwise, adjust the recursion offset if it's after the start of this
2385     group. */
2386    
2387     if (hc >= cd->hwm)
2388     {
2389     offset = GET(ptr, 1);
2390     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
2391     }
2392    
2393 nigel 77 ptr += 1 + LINK_SIZE;
2394     }
2395     }
2396    
2397    
2398    
2399     /*************************************************
2400     * Insert an automatic callout point *
2401     *************************************************/
2402    
2403     /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
2404     callout points before each pattern item.
2405    
2406     Arguments:
2407     code current code pointer
2408     ptr current pattern pointer
2409     cd pointers to tables etc
2410    
2411     Returns: new code pointer
2412     */
2413    
2414     static uschar *
2415     auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
2416     {
2417     *code++ = OP_CALLOUT;
2418     *code++ = 255;
2419 ph10 530 PUT(code, 0, (int)(ptr - cd->start_pattern)); /* Pattern offset */
2420     PUT(code, LINK_SIZE, 0); /* Default length */
2421 nigel 77 return code + 2*LINK_SIZE;
2422     }
2423    
2424    
2425    
2426     /*************************************************
2427     * Complete a callout item *
2428     *************************************************/
2429    
2430     /* A callout item contains the length of the next item in the pattern, which
2431     we can't fill in till after we have reached the relevant point. This is used
2432     for both automatic and manual callouts.
2433    
2434     Arguments:
2435     previous_callout points to previous callout item
2436     ptr current pattern pointer
2437     cd pointers to tables etc
2438    
2439     Returns: nothing
2440     */
2441    
2442     static void
2443     complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
2444     {
2445 ph10 530 int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));
2446 nigel 77 PUT(previous_callout, 2 + LINK_SIZE, length);
2447     }
2448    
2449    
2450    
2451     #ifdef SUPPORT_UCP
2452     /*************************************************
2453     * Get othercase range *
2454     *************************************************/
2455    
2456     /* This function is passed the start and end of a class range, in UTF-8 mode
2457     with UCP support. It searches up the characters, looking for internal ranges of
2458     characters in the "other" case. Each call returns the next one, updating the
2459     start address.
2460    
2461     Arguments:
2462     cptr points to starting character value; updated
2463     d end value
2464     ocptr where to put start of othercase range
2465     odptr where to put end of othercase range
2466    
2467     Yield: TRUE when range returned; FALSE when no more
2468     */
2469    
2470     static BOOL
2471 nigel 93 get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
2472     unsigned int *odptr)
2473 nigel 77 {
2474 nigel 93 unsigned int c, othercase, next;
2475 nigel 77
2476     for (c = *cptr; c <= d; c++)
2477 ph10 349 { if ((othercase = UCD_OTHERCASE(c)) != c) break; }
2478 nigel 77
2479     if (c > d) return FALSE;
2480    
2481     *ocptr = othercase;
2482     next = othercase + 1;
2483    
2484     for (++c; c <= d; c++)
2485     {
2486 ph10 349 if (UCD_OTHERCASE(c) != next) break;
2487 nigel 77 next++;
2488     }
2489    
2490     *odptr = next - 1;
2491     *cptr = c;
2492    
2493     return TRUE;
2494     }
2495 ph10 532
2496    
2497    
2498     /*************************************************
2499     * Check a character and a property *
2500     *************************************************/
2501    
2502     /* This function is called by check_auto_possessive() when a property item
2503     is adjacent to a fixed character.
2504    
2505     Arguments:
2506     c the character
2507     ptype the property type
2508     pdata the data for the type
2509     negated TRUE if it's a negated property (\P or \p{^)
2510 ph10 535
2511 ph10 532 Returns: TRUE if auto-possessifying is OK
2512 ph10 535 */
2513 ph10 532
2514     static BOOL
2515     check_char_prop(int c, int ptype, int pdata, BOOL negated)
2516     {
2517     const ucd_record *prop = GET_UCD(c);
2518     switch(ptype)
2519     {
2520     case PT_LAMP:
2521     return (prop->chartype == ucp_Lu ||
2522     prop->chartype == ucp_Ll ||
2523     prop->chartype == ucp_Lt) == negated;
2524    
2525     case PT_GC:
2526     return (pdata == _pcre_ucp_gentype[prop->chartype]) == negated;
2527    
2528     case PT_PC:
2529     return (pdata == prop->chartype) == negated;
2530    
2531     case PT_SC:
2532     return (pdata == prop->script) == negated;
2533    
2534     /* These are specials */
2535    
2536     case PT_ALNUM:
2537     return (_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2538     _pcre_ucp_gentype[prop->chartype] == ucp_N) == negated;
2539    
2540     case PT_SPACE: /* Perl space */
2541     return (_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2542     c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2543     == negated;
2544    
2545     case PT_PXSPACE: /* POSIX space */
2546     return (_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2547     c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2548     c == CHAR_FF || c == CHAR_CR)
2549     == negated;
2550    
2551     case PT_WORD:
2552     return (_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2553     _pcre_ucp_gentype[prop->chartype] == ucp_N ||
2554     c == CHAR_UNDERSCORE) == negated;
2555     }
2556 ph10 535 return FALSE;
2557 ph10 532 }
2558 nigel 77 #endif /* SUPPORT_UCP */
2559    
2560    
2561 nigel 93
2562 nigel 77 /*************************************************
2563 nigel 93 * Check if auto-possessifying is possible *
2564     *************************************************/
2565    
2566     /* This function is called for unlimited repeats of certain items, to see
2567     whether the next thing could possibly match the repeated item. If not, it makes
2568     sense to automatically possessify the repeated item.
2569    
2570     Arguments:
2571 ph10 532 previous pointer to the repeated opcode
2572 nigel 93 utf8 TRUE in UTF-8 mode
2573     ptr next character in pattern
2574     options options bits
2575     cd contains pointers to tables etc.
2576    
2577     Returns: TRUE if possessifying is wanted
2578     */
2579    
2580     static BOOL
2581 ph10 535 check_auto_possessive(const uschar *previous, BOOL utf8, const uschar *ptr,
2582 ph10 532 int options, compile_data *cd)
2583 nigel 93 {
2584 ph10 532 int c, next;
2585     int op_code = *previous++;
2586 nigel 93
2587     /* Skip whitespace and comments in extended mode */
2588    
2589     if ((options & PCRE_EXTENDED) != 0)
2590     {
2591     for (;;)
2592     {
2593     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2594 ph10 391 if (*ptr == CHAR_NUMBER_SIGN)
2595 nigel 93 {
2596 ph10 579 ptr++;
2597 ph10 556 while (*ptr != 0)
2598     {
2599 nigel 93 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2600 ph10 556 ptr++;
2601 ph10 579 #ifdef SUPPORT_UTF8
2602 ph10 556 if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
2603     #endif
2604     }
2605 nigel 93 }
2606     else break;
2607     }
2608     }
2609    
2610     /* If the next item is one that we can handle, get its value. A non-negative
2611     value is a character, a negative value is an escape value. */
2612    
2613 ph10 391 if (*ptr == CHAR_BACKSLASH)
2614 nigel 93 {
2615     int temperrorcode = 0;
2616     next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2617     if (temperrorcode != 0) return FALSE;
2618     ptr++; /* Point after the escape sequence */
2619     }
2620    
2621     else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2622     {
2623     #ifdef SUPPORT_UTF8
2624     if (utf8) { GETCHARINC(next, ptr); } else
2625     #endif
2626     next = *ptr++;
2627     }
2628    
2629     else return FALSE;
2630    
2631     /* Skip whitespace and comments in extended mode */
2632    
2633     if ((options & PCRE_EXTENDED) != 0)
2634     {
2635     for (;;)
2636     {
2637     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2638 ph10 391 if (*ptr == CHAR_NUMBER_SIGN)
2639 nigel 93 {
2640 ph10 579 ptr++;
2641 ph10 556 while (*ptr != 0)
2642     {
2643 nigel 93 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2644 ph10 556 ptr++;
2645 ph10 579 #ifdef SUPPORT_UTF8
2646 ph10 556 if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
2647     #endif
2648     }
2649 nigel 93 }
2650     else break;
2651     }
2652     }
2653    
2654     /* If the next thing is itself optional, we have to give up. */
2655    
2656 ph10 392 if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
2657 ph10 391 strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
2658     return FALSE;
2659 nigel 93
2660 ph10 532 /* Now compare the next item with the previous opcode. First, handle cases when
2661     the next item is a character. */
2662 nigel 93
2663     if (next >= 0) switch(op_code)
2664     {
2665     case OP_CHAR:
2666 ph10 535 #ifdef SUPPORT_UTF8
2667 ph10 532 GETCHARTEST(c, previous);
2668 ph10 369 #else
2669 ph10 532 c = *previous;
2670 ph10 535 #endif
2671     return c != next;
2672 nigel 93
2673 ph10 602 /* For CHARI (caseless character) we must check the other case. If we have
2674 nigel 93 Unicode property support, we can use it to test the other case of
2675     high-valued characters. */
2676    
2677 ph10 602 case OP_CHARI:
2678 ph10 535 #ifdef SUPPORT_UTF8
2679 ph10 532 GETCHARTEST(c, previous);
2680     #else
2681     c = *previous;
2682 ph10 535 #endif
2683 ph10 532 if (c == next) return FALSE;
2684 nigel 93 #ifdef SUPPORT_UTF8
2685     if (utf8)
2686     {
2687     unsigned int othercase;
2688     if (next < 128) othercase = cd->fcc[next]; else
2689     #ifdef SUPPORT_UCP
2690 ph10 349 othercase = UCD_OTHERCASE((unsigned int)next);
2691 nigel 93 #else
2692     othercase = NOTACHAR;
2693     #endif
2694 ph10 532 return (unsigned int)c != othercase;
2695 nigel 93 }
2696     else
2697     #endif /* SUPPORT_UTF8 */
2698 ph10 532 return (c != cd->fcc[next]); /* Non-UTF-8 mode */
2699 nigel 93
2700 ph10 602 /* For OP_NOT and OP_NOTI, the data is always a single-byte character. These
2701 ph10 604 opcodes are not used for multi-byte characters, because they are coded using
2702 ph10 602 an XCLASS instead. */
2703 nigel 93
2704     case OP_NOT:
2705 ph10 602 return (c = *previous) == next;
2706 ph10 604
2707     case OP_NOTI:
2708 ph10 532 if ((c = *previous) == next) return TRUE;
2709 nigel 93 #ifdef SUPPORT_UTF8
2710     if (utf8)
2711     {
2712     unsigned int othercase;
2713     if (next < 128) othercase = cd->fcc[next]; else
2714     #ifdef SUPPORT_UCP
2715 ph10 349 othercase = UCD_OTHERCASE(next);
2716 nigel 93 #else
2717     othercase = NOTACHAR;
2718     #endif
2719 ph10 532 return (unsigned int)c == othercase;
2720 nigel 93 }
2721     else
2722     #endif /* SUPPORT_UTF8 */
2723 ph10 532 return (c == cd->fcc[next]); /* Non-UTF-8 mode */
2724 nigel 93
2725 ph10 535 /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.
2726     When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
2727    
2728 nigel 93 case OP_DIGIT:
2729     return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2730    
2731     case OP_NOT_DIGIT:
2732     return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2733    
2734     case OP_WHITESPACE:
2735     return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2736    
2737     case OP_NOT_WHITESPACE:
2738     return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2739    
2740     case OP_WORDCHAR:
2741     return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2742    
2743     case OP_NOT_WORDCHAR:
2744     return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2745    
2746 ph10 180 case OP_HSPACE:
2747     case OP_NOT_HSPACE:
2748     switch(next)
2749     {
2750     case 0x09:
2751     case 0x20:
2752     case 0xa0:
2753     case 0x1680:
2754     case 0x180e:
2755     case 0x2000:
2756     case 0x2001:
2757     case 0x2002:
2758     case 0x2003:
2759     case 0x2004:
2760     case 0x2005:
2761     case 0x2006:
2762     case 0x2007:
2763     case 0x2008:
2764     case 0x2009:
2765     case 0x200A:
2766     case 0x202f:
2767     case 0x205f:
2768     case 0x3000:
2769 ph10 528 return op_code == OP_NOT_HSPACE;
2770 ph10 180 default:
2771 ph10 528 return op_code != OP_NOT_HSPACE;
2772 ph10 180 }
2773    
2774 ph10 528 case OP_ANYNL:
2775 ph10 180 case OP_VSPACE:
2776     case OP_NOT_VSPACE:
2777     switch(next)
2778     {
2779     case 0x0a:
2780     case 0x0b:
2781     case 0x0c:
2782     case 0x0d:
2783     case 0x85:
2784     case 0x2028:
2785     case 0x2029:
2786 ph10 528 return op_code == OP_NOT_VSPACE;
2787 ph10 180 default:
2788 ph10 528 return op_code != OP_NOT_VSPACE;
2789 ph10 180 }
2790    
2791 ph10 532 #ifdef SUPPORT_UCP
2792     case OP_PROP:
2793     return check_char_prop(next, previous[0], previous[1], FALSE);
2794 ph10 535
2795 ph10 532 case OP_NOTPROP:
2796     return check_char_prop(next, previous[0], previous[1], TRUE);
2797     #endif
2798    
2799 nigel 93 default:
2800     return FALSE;
2801     }
2802    
2803    
2804 ph10 535 /* Handle the case when the next item is \d, \s, etc. Note that when PCRE_UCP
2805     is set, \d turns into ESC_du rather than ESC_d, etc., so ESC_d etc. are
2806     generated only when PCRE_UCP is *not* set, that is, when only ASCII
2807     characteristics are recognized. Similarly, the opcodes OP_DIGIT etc. are
2808 ph10 532 replaced by OP_PROP codes when PCRE_UCP is set. */
2809 nigel 93
2810     switch(op_code)
2811     {
2812     case OP_CHAR:
2813 ph10 602 case OP_CHARI:
2814 ph10 535 #ifdef SUPPORT_UTF8
2815 ph10 532 GETCHARTEST(c, previous);
2816     #else
2817     c = *previous;
2818 ph10 535 #endif
2819 nigel 93 switch(-next)
2820     {
2821     case ESC_d:
2822 ph10 532 return c > 127 || (cd->ctypes[c] & ctype_digit) == 0;
2823 nigel 93
2824     case ESC_D:
2825 ph10 532 return c <= 127 && (cd->ctypes[c] & ctype_digit) != 0;
2826 nigel 93
2827     case ESC_s:
2828 ph10 532 return c > 127 || (cd->ctypes[c] & ctype_space) == 0;
2829 nigel 93
2830     case ESC_S:
2831 ph10 532 return c <= 127 && (cd->ctypes[c] & ctype_space) != 0;
2832 nigel 93
2833     case ESC_w:
2834 ph10 532 return c > 127 || (cd->ctypes[c] & ctype_word) == 0;
2835 nigel 93
2836     case ESC_W:
2837 ph10 532 return c <= 127 && (cd->ctypes[c] & ctype_word) != 0;
2838 ph10 182
2839 ph10 180 case ESC_h:
2840     case ESC_H:
2841 ph10 532 switch(c)
2842 ph10 180 {
2843     case 0x09:
2844     case 0x20:
2845     case 0xa0:
2846     case 0x1680:
2847     case 0x180e:
2848     case 0x2000:
2849     case 0x2001:
2850     case 0x2002:
2851     case 0x2003:
2852     case 0x2004:
2853     case 0x2005:
2854     case 0x2006:
2855     case 0x2007:
2856     case 0x2008:
2857     case 0x2009:
2858     case 0x200A:
2859     case 0x202f:
2860     case 0x205f:
2861     case 0x3000:
2862     return -next != ESC_h;
2863     default:
2864     return -next == ESC_h;
2865 ph10 182 }
2866    
2867 ph10 180 case ESC_v:
2868     case ESC_V:
2869 ph10 532 switch(c)
2870 ph10 180 {
2871     case 0x0a:
2872     case 0x0b:
2873     case 0x0c:
2874     case 0x0d:
2875     case 0x85:
2876     case 0x2028:
2877     case 0x2029:
2878     return -next != ESC_v;
2879     default:
2880     return -next == ESC_v;
2881 ph10 182 }
2882 ph10 535
2883     /* When PCRE_UCP is set, these values get generated for \d etc. Find
2884     their substitutions and process them. The result will always be either
2885 ph10 532 -ESC_p or -ESC_P. Then fall through to process those values. */
2886 ph10 535
2887 ph10 532 #ifdef SUPPORT_UCP
2888     case ESC_du:
2889     case ESC_DU:
2890     case ESC_wu:
2891     case ESC_WU:
2892     case ESC_su:
2893     case ESC_SU:
2894     {
2895     int temperrorcode = 0;
2896     ptr = substitutes[-next - ESC_DU];
2897     next = check_escape(&ptr, &temperrorcode, 0, options, FALSE);
2898     if (temperrorcode != 0) return FALSE;
2899     ptr++; /* For compatibility */
2900     }
2901 ph10 535 /* Fall through */
2902 nigel 93
2903 ph10 532 case ESC_p:
2904     case ESC_P:
2905     {
2906     int ptype, pdata, errorcodeptr;
2907 ph10 535 BOOL negated;
2908    
2909 ph10 532 ptr--; /* Make ptr point at the p or P */
2910     ptype = get_ucp(&ptr, &negated, &pdata, &errorcodeptr);
2911     if (ptype < 0) return FALSE;
2912     ptr++; /* Point past the final curly ket */
2913 ph10 535
2914 ph10 532 /* If the property item is optional, we have to give up. (When generated
2915     from \d etc by PCRE_UCP, this test will have been applied much earlier,
2916     to the original \d etc. At this point, ptr will point to a zero byte. */
2917 ph10 535
2918 ph10 532 if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
2919     strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
2920     return FALSE;
2921 ph10 535
2922 ph10 532 /* Do the property check. */
2923 ph10 535
2924 ph10 532 return check_char_prop(c, ptype, pdata, (next == -ESC_P) != negated);
2925 ph10 535 }
2926 ph10 532 #endif
2927    
2928 nigel 93 default:
2929     return FALSE;
2930     }
2931    
2932 ph10 535 /* In principle, support for Unicode properties should be integrated here as
2933     well. It means re-organizing the above code so as to get hold of the property
2934     values before switching on the op-code. However, I wonder how many patterns
2935     combine ASCII \d etc with Unicode properties? (Note that if PCRE_UCP is set,
2936     these op-codes are never generated.) */
2937    
2938 nigel 93 case OP_DIGIT:
2939 ph10 180 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2940 ph10 528 next == -ESC_h || next == -ESC_v || next == -ESC_R;
2941 nigel 93
2942     case OP_NOT_DIGIT:
2943     return next == -ESC_d;
2944    
2945     case OP_WHITESPACE:
2946 ph10 528 return next == -ESC_S || next == -ESC_d || next == -ESC_w || next == -ESC_R;
2947 nigel 93
2948     case OP_NOT_WHITESPACE:
2949 ph10 180 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2950 nigel 93
2951 ph10 180 case OP_HSPACE:
2952 ph10 535 return next == -ESC_S || next == -ESC_H || next == -ESC_d ||
2953 ph10 528 next == -ESC_w || next == -ESC_v || next == -ESC_R;
2954 ph10 180
2955     case OP_NOT_HSPACE:
2956     return next == -ESC_h;
2957 ph10 182
2958 ph10 180 /* Can't have \S in here because VT matches \S (Perl anomaly) */
2959 ph10 535 case OP_ANYNL:
2960 ph10 182 case OP_VSPACE:
2961 ph10 180 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2962    
2963     case OP_NOT_VSPACE:
2964 ph10 528 return next == -ESC_v || next == -ESC_R;
2965 ph10 180
2966 nigel 93 case OP_WORDCHAR:
2967 ph10 535 return next == -ESC_W || next == -ESC_s || next == -ESC_h ||
2968 ph10 528 next == -ESC_v || next == -ESC_R;
2969 nigel 93
2970     case OP_NOT_WORDCHAR:
2971     return next == -ESC_w || next == -ESC_d;
2972 ph10 182
2973 nigel 93 default:
2974     return FALSE;
2975     }
2976    
2977     /* Control does not reach here */
2978     }
2979    
2980    
2981    
2982     /*************************************************
2983 nigel 77 * Compile one branch *
2984     *************************************************/
2985    
2986 nigel 93 /* Scan the pattern, compiling it into a vector. If the options are
2987 nigel 77 changed during the branch, the pointer is used to change the external options
2988 nigel 93 bits. This function is used during the pre-compile phase when we are trying
2989     to find out the amount of memory needed, as well as during the real compile
2990     phase. The value of lengthptr distinguishes the two phases.
2991 nigel 77
2992     Arguments:
2993     optionsptr pointer to the option bits
2994     codeptr points to the pointer to the current code point
2995     ptrptr points to the current pattern pointer
2996     errorcodeptr points to error code variable
2997     firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2998     reqbyteptr set to the last literal character required, else < 0
2999     bcptr points to current branch chain
3000     cd contains pointers to tables etc.
3001 nigel 93 lengthptr NULL during the real compile phase
3002     points to length accumulator during pre-compile phase
3003 nigel 77
3004     Returns: TRUE on success
3005     FALSE, with *errorcodeptr set non-zero on error
3006     */
3007    
3008     static BOOL
3009 nigel 93 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
3010     int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
3011     compile_data *cd, int *lengthptr)
3012 nigel 77 {
3013     int repeat_type, op_type;
3014     int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
3015     int bravalue = 0;
3016     int greedy_default, greedy_non_default;
3017     int firstbyte, reqbyte;
3018     int zeroreqbyte, zerofirstbyte;
3019     int req_caseopt, reqvary, tempreqvary;
3020     int options = *optionsptr;
3021     int after_manual_callout = 0;
3022 nigel 93 int length_prevgroup = 0;
3023 nigel 77 register int c;
3024     register uschar *code = *codeptr;
3025 nigel 93 uschar *last_code = code;
3026     uschar *orig_code = code;
3027 nigel 77 uschar *tempcode;
3028     BOOL inescq = FALSE;
3029     BOOL groupsetfirstbyte = FALSE;
3030     const uschar *ptr = *ptrptr;
3031     const uschar *tempptr;
3032 ph10 518 const uschar *nestptr = NULL;
3033 nigel 77 uschar *previous = NULL;
3034     uschar *previous_callout = NULL;
3035 nigel 93 uschar *save_hwm = NULL;
3036 nigel 77 uschar classbits[32];
3037    
3038     #ifdef SUPPORT_UTF8
3039     BOOL class_utf8;
3040     BOOL utf8 = (options & PCRE_UTF8) != 0;
3041     uschar *class_utf8data;
3042 ph10 300 uschar *class_utf8data_base;
3043 nigel 77 uschar utf8_char[6];
3044     #else
3045     BOOL utf8 = FALSE;
3046 nigel 93 uschar *utf8_char = NULL;
3047 nigel 77 #endif
3048    
3049 ph10 475 #ifdef PCRE_DEBUG
3050 nigel 93 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
3051     #endif
3052    
3053 nigel 77 /* Set up the default and non-default settings for greediness */
3054    
3055     greedy_default = ((options & PCRE_UNGREEDY) != 0);
3056     greedy_non_default = greedy_default ^ 1;
3057    
3058     /* Initialize no first byte, no required byte. REQ_UNSET means "no char
3059     matching encountered yet". It gets changed to REQ_NONE if we hit something that
3060     matches a non-fixed char first char; reqbyte just remains unset if we never
3061     find one.
3062    
3063     When we hit a repeat whose minimum is zero, we may have to adjust these values
3064     to take the zero repeat into account. This is implemented by setting them to
3065     zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
3066     item types that can be repeated set these backoff variables appropriately. */
3067    
3068     firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
3069    
3070     /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
3071     according to the current setting of the caseless flag. REQ_CASELESS is a bit
3072     value > 255. It is added into the firstbyte or reqbyte variables to record the
3073     case status of the value. This is used only for ASCII characters. */
3074    
3075     req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
3076    
3077     /* Switch on next character until the end of the branch */
3078    
3079     for (;; ptr++)
3080     {
3081     BOOL negate_class;
3082 ph10 286 BOOL should_flip_negation;
3083 nigel 77 BOOL possessive_quantifier;
3084     BOOL is_quantifier;
3085 nigel 93 BOOL is_recurse;
3086 ph10 180 BOOL reset_bracount;
3087 nigel 77 int class_charcount;
3088     int class_lastchar;
3089     int newoptions;
3090     int recno;
3091 ph10 172 int refsign;
3092 nigel 77 int skipbytes;
3093     int subreqbyte;
3094     int subfirstbyte;
3095 nigel 93 int terminator;
3096 nigel 77 int mclength;
3097     uschar mcbuffer[8];
3098    
3099 nigel 93 /* Get next byte in the pattern */
3100 nigel 77
3101     c = *ptr;
3102 ph10 345
3103 ph10 535 /* If we are at the end of a nested substitution, revert to the outer level
3104 ph10 518 string. Nesting only happens one level deep. */
3105    
3106     if (c == 0 && nestptr != NULL)
3107     {
3108     ptr = nestptr;
3109     nestptr = NULL;
3110     c = *ptr;
3111     }
3112    
3113 nigel 93 /* If we are in the pre-compile phase, accumulate the length used for the
3114     previous cycle of this loop. */
3115    
3116     if (lengthptr != NULL)
3117     {
3118 ph10 475 #ifdef PCRE_DEBUG
3119 nigel 93 if (code > cd->hwm) cd->hwm = code; /* High water info */
3120     #endif
3121 ph10 505 if (code > cd->start_workspace + WORK_SIZE_CHECK) /* Check for overrun */
3122 nigel 93 {
3123     *errorcodeptr = ERR52;
3124     goto FAILED;
3125     }
3126    
3127     /* There is at least one situation where code goes backwards: this is the
3128     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
3129     the class is simply eliminated. However, it is created first, so we have to
3130     allow memory for it. Therefore, don't ever reduce the length at this point.
3131     */
3132    
3133     if (code < last_code) code = last_code;
3134 ph10 202
3135     /* Paranoid check for integer overflow */
3136    
3137     if (OFLOW_MAX - *lengthptr < code - last_code)
3138     {
3139     *errorcodeptr = ERR20;
3140     goto FAILED;
3141     }
3142    
3143 ph10 530 *lengthptr += (int)(code - last_code);
3144 nigel 93 DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
3145    
3146     /* If "previous" is set and it is not at the start of the work space, move
3147     it back to there, in order to avoid filling up the work space. Otherwise,
3148     if "previous" is NULL, reset the current code pointer to the start. */
3149    
3150     if (previous != NULL)
3151     {
3152     if (previous > orig_code)
3153     {
3154     memmove(orig_code, previous, code - previous);
3155     code -= previous - orig_code;
3156     previous = orig_code;
3157     }
3158     }
3159     else code = orig_code;
3160    
3161     /* Remember where this code item starts so we can pick up the length
3162     next time round. */
3163    
3164     last_code = code;
3165     }
3166    
3167     /* In the real compile phase, just check the workspace used by the forward
3168     reference list. */
3169    
3170 ph10 505 else if (cd->hwm > cd->start_workspace + WORK_SIZE_CHECK)
3171 nigel 93 {
3172     *errorcodeptr = ERR52;
3173     goto FAILED;
3174     }
3175    
3176 nigel 77 /* If in \Q...\E, check for the end; if not, we have a literal */
3177    
3178     if (inescq && c != 0)
3179     {
3180 ph10 391 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
3181 nigel 77 {
3182     inescq = FALSE;
3183     ptr++;
3184     continue;
3185     }
3186     else
3187     {
3188     if (previous_callout != NULL)
3189     {
3190 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
3191     complete_callout(previous_callout, ptr, cd);
3192 nigel 77 previous_callout = NULL;
3193     }
3194     if ((options & PCRE_AUTO_CALLOUT) != 0)
3195     {
3196     previous_callout = code;
3197     code = auto_callout(code, ptr, cd);
3198     }
3199     goto NORMAL_CHAR;
3200     }
3201     }
3202    
3203     /* Fill in length of a previous callout, except when the next thing is
3204     a quantifier. */
3205    
3206 ph10 392 is_quantifier =
3207 ph10 391 c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
3208     (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
3209 nigel 77
3210     if (!is_quantifier && previous_callout != NULL &&
3211     after_manual_callout-- <= 0)
3212     {
3213 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
3214     complete_callout(previous_callout, ptr, cd);
3215 nigel 77 previous_callout = NULL;
3216     }
3217    
3218     /* In extended mode, skip white space and comments */
3219    
3220     if ((options & PCRE_EXTENDED) != 0)
3221     {
3222     if ((cd->ctypes[c] & ctype_space) != 0) continue;
3223 ph10 391 if (c == CHAR_NUMBER_SIGN)
3224 nigel 77 {
3225 ph10 579 ptr++;
3226 ph10 556 while (*ptr != 0)
3227 nigel 91 {
3228 nigel 93 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
3229 ph10 556 ptr++;
3230 ph10 579 #ifdef SUPPORT_UTF8
3231 ph10 556 if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
3232     #endif
3233 nigel 91 }
3234 nigel 93 if (*ptr != 0) continue;
3235    
3236 nigel 91 /* Else fall through to handle end of string */
3237     c = 0;
3238 nigel 77 }
3239     }
3240    
3241     /* No auto callout for quantifiers. */
3242    
3243     if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
3244     {
3245     previous_callout = code;
3246     code = auto_callout(code, ptr, cd);
3247     }
3248    
3249     switch(c)
3250     {
3251 nigel 93 /* ===================================================================*/
3252     case 0: /* The branch terminates at string end */
3253 ph10 391 case CHAR_VERTICAL_LINE: /* or | or ) */
3254     case CHAR_RIGHT_PARENTHESIS:
3255 nigel 77 *firstbyteptr = firstbyte;
3256     *reqbyteptr = reqbyte;
3257     *codeptr = code;
3258     *ptrptr = ptr;
3259 nigel 93 if (lengthptr != NULL)
3260     {
3261 ph10 202 if (OFLOW_MAX - *lengthptr < code - last_code)
3262     {
3263     *errorcodeptr = ERR20;
3264     goto FAILED;
3265     }
3266 ph10 530 *lengthptr += (int)(code - last_code); /* To include callout length */
3267 nigel 93 DPRINTF((">> end branch\n"));
3268     }
3269 nigel 77 return TRUE;
3270    
3271 nigel 93
3272     /* ===================================================================*/
3273 nigel 77 /* Handle single-character metacharacters. In multiline mode, ^ disables
3274     the setting of any following char as a first character. */
3275    
3276 ph10 391 case CHAR_CIRCUMFLEX_ACCENT:
3277 ph10 602 previous = NULL;
3278 nigel 77 if ((options & PCRE_MULTILINE) != 0)
3279     {
3280     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3281 ph10 602 *code++ = OP_CIRCM;
3282 nigel 77 }
3283 ph10 602 else *code++ = OP_CIRC;
3284 nigel 77 break;
3285    
3286 ph10 391 case CHAR_DOLLAR_SIGN:
3287 nigel 77 previous = NULL;
3288 ph10 602 *code++ = ((options & PCRE_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
3289 nigel 77 break;
3290    
3291     /* There can never be a first char if '.' is first, whatever happens about
3292     repeats. The value of reqbyte doesn't change either. */
3293    
3294 ph10 391 case CHAR_DOT:
3295 nigel 77 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3296     zerofirstbyte = firstbyte;
3297     zeroreqbyte = reqbyte;
3298     previous = code;
3299 ph10 342 *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
3300 nigel 77 break;
3301    
3302 nigel 93
3303     /* ===================================================================*/
3304 nigel 87 /* Character classes. If the included characters are all < 256, we build a
3305     32-byte bitmap of the permitted characters, except in the special case
3306     where there is only one such character. For negated classes, we build the
3307     map as usual, then invert it at the end. However, we use a different opcode
3308     so that data characters > 255 can be handled correctly.
3309 nigel 77
3310     If the class contains characters outside the 0-255 range, a different
3311     opcode is compiled. It may optionally have a bit map for characters < 256,
3312     but those above are are explicitly listed afterwards. A flag byte tells
3313     whether the bitmap is present, and whether this is a negated class or not.
3314 ph10 345
3315 ph10 336 In JavaScript compatibility mode, an isolated ']' causes an error. In
3316     default (Perl) mode, it is treated as a data character. */
3317 ph10 345
3318 ph10 391 case CHAR_RIGHT_SQUARE_BRACKET:
3319 ph10 336 if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
3320     {
3321     *errorcodeptr = ERR64;
3322 ph10 345 goto FAILED;
3323 ph10 336 }
3324 ph10 345 goto NORMAL_CHAR;
3325 nigel 77
3326 ph10 391 case CHAR_LEFT_SQUARE_BRACKET:
3327 nigel 77 previous = code;
3328    
3329     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
3330     they are encountered at the top level, so we'll do that too. */
3331    
3332 ph10 392 if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3333 ph10 391 ptr[1] == CHAR_EQUALS_SIGN) &&
3334 ph10 295 check_posix_syntax(ptr, &tempptr))
3335 nigel 77 {
3336 ph10 391 *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
3337 nigel 77 goto FAILED;
3338     }
3339    
3340 ph10 205 /* If the first character is '^', set the negation flag and skip it. Also,
3341 ph10 208 if the first few characters (either before or after ^) are \Q\E or \E we
3342 ph10 205 skip them too. This makes for compatibility with Perl. */
3343 ph10 208
3344 ph10 205 negate_class = FALSE;
3345     for (;;)
3346 nigel 77 {
3347     c = *(++ptr);
3348 ph10 391 if (c == CHAR_BACKSLASH)
3349 ph10 205 {
3350 ph10 392 if (ptr[1] == CHAR_E)
3351 ph10 391 ptr++;
3352 ph10 392 else if (strncmp((const char *)ptr+1,
3353     STR_Q STR_BACKSLASH STR_E, 3) == 0)
3354 ph10 391 ptr += 3;
3355 ph10 392 else
3356 ph10 391 break;
3357 ph10 205 }
3358 ph10 391 else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
3359 ph10 205 negate_class = TRUE;
3360     else break;
3361 ph10 208 }
3362 ph10 345
3363     /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
3364     an initial ']' is taken as a data character -- the code below handles
3365 ph10 341 that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
3366     [^] must match any character, so generate OP_ALLANY. */
3367 ph10 345
3368 ph10 392 if (c == CHAR_RIGHT_SQUARE_BRACKET &&
3369 ph10 391 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
3370 ph10 341 {
3371     *code++ = negate_class? OP_ALLANY : OP_FAIL;
3372     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3373     zerofirstbyte = firstbyte;
3374     break;
3375 ph10 345 }
3376 nigel 77
3377 ph10 286 /* If a class contains a negative special such as \S, we need to flip the
3378     negation flag at the end, so that support for characters > 255 works
3379 ph10 264 correctly (they are all included in the class). */
3380    
3381     should_flip_negation = FALSE;
3382    
3383 nigel 77 /* Keep a count of chars with values < 256 so that we can optimize the case
3384 nigel 93 of just a single character (as long as it's < 256). However, For higher
3385     valued UTF-8 characters, we don't yet do any optimization. */
3386 nigel 77
3387     class_charcount = 0;
3388     class_lastchar = -1;
3389    
3390 nigel 93 /* Initialize the 32-char bit map to all zeros. We build the map in a
3391     temporary bit of memory, in case the class contains only 1 character (less
3392     than 256), because in that case the compiled code doesn't use the bit map.
3393     */
3394    
3395     memset(classbits, 0, 32 * sizeof(uschar));
3396    
3397 nigel 77 #ifdef SUPPORT_UTF8
3398     class_utf8 = FALSE; /* No chars >= 256 */
3399 nigel 93 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
3400 ph10 309 class_utf8data_base = class_utf8data; /* For resetting in pass 1 */
3401 nigel 77 #endif
3402    
3403     /* Process characters until ] is reached. By writing this as a "do" it
3404 nigel 93 means that an initial ] is taken as a data character. At the start of the
3405     loop, c contains the first byte of the character. */
3406 nigel 77
3407 nigel 93 if (c != 0) do
3408 nigel 77 {
3409 nigel 93 const uschar *oldptr;
3410    
3411 nigel 77 #ifdef SUPPORT_UTF8
3412     if (utf8 && c > 127)
3413     { /* Braces are required because the */
3414     GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
3415     }
3416 ph10 535
3417 ph10 300 /* In the pre-compile phase, accumulate the length of any UTF-8 extra
3418 ph10 309 data and reset the pointer. This is so that very large classes that
3419 ph10 300 contain a zillion UTF-8 characters no longer overwrite the work space
3420 ph10 309 (which is on the stack). */
3421    
3422 ph10 300 if (lengthptr != NULL)
3423     {
3424     *lengthptr += class_utf8data - class_utf8data_base;
3425 ph10 309 class_utf8data = class_utf8data_base;
3426     }
3427    
3428 nigel 77 #endif
3429    
3430     /* Inside \Q...\E everything is literal except \E */
3431    
3432     if (inescq)
3433     {
3434 ph10 391 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E) /* If we are at \E */
3435 nigel 77 {
3436 nigel 93 inescq = FALSE; /* Reset literal state */
3437     ptr++; /* Skip the 'E' */
3438     continue; /* Carry on with next */
3439 nigel 77 }
3440 nigel 93 goto CHECK_RANGE; /* Could be range if \E follows */
3441 nigel 77 }
3442    
3443     /* Handle POSIX class names. Perl allows a negation extension of the
3444     form [:^name:]. A square bracket that doesn't match the syntax is
3445     treated as a literal. We also recognize the POSIX constructions
3446     [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
3447     5.6 and 5.8 do. */
3448    
3449 ph10 391 if (c == CHAR_LEFT_SQUARE_BRACKET &&
3450 ph10 392 (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3451 ph10 391 ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
3452 nigel 77 {
3453     BOOL local_negate = FALSE;
3454 nigel 87 int posix_class, taboffset, tabopt;
3455 nigel 77 register const uschar *cbits = cd->cbits;
3456 nigel 87 uschar pbits[32];
3457 nigel 77
3458 ph10 391 if (ptr[1] != CHAR_COLON)
3459 nigel 77 {
3460     *errorcodeptr = ERR31;
3461     goto FAILED;
3462     }
3463    
3464     ptr += 2;
3465 ph10 391 if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
3466 nigel 77 {
3467     local_negate = TRUE;
3468 ph10 286 should_flip_negation = TRUE; /* Note negative special */
3469 nigel 77 ptr++;
3470     }
3471    
3472 ph10 530 posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
3473 nigel 77 if (posix_class < 0)
3474     {
3475     *errorcodeptr = ERR30;
3476     goto FAILED;
3477     }
3478    
3479     /* If matching is caseless, upper and lower are converted to
3480     alpha. This relies on the fact that the class table starts with
3481     alpha, lower, upper as the first 3 entries. */
3482    
3483     if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
3484     posix_class = 0;
3485 ph10 535
3486     /* When PCRE_UCP is set, some of the POSIX classes are converted to
3487 ph10 518 different escape sequences that use Unicode properties. */
3488 ph10 535
3489 ph10 518 #ifdef SUPPORT_UCP
3490     if ((options & PCRE_UCP) != 0)
3491     {
3492     int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);
3493     if (posix_substitutes[pc] != NULL)
3494     {
3495 ph10 535 nestptr = tempptr + 1;
3496 ph10 518 ptr = posix_substitutes[pc] - 1;
3497 ph10 535 continue;
3498     }
3499     }
3500     #endif
3501 ph10 518 /* In the non-UCP case, we build the bit map for the POSIX class in a
3502     chunk of local store because we may be adding and subtracting from it,
3503     and we don't want to subtract bits that may be in the main map already.
3504     At the end we or the result into the bit map that is being built. */
3505 nigel 77
3506     posix_class *= 3;
3507 nigel 87
3508     /* Copy in the first table (always present) */
3509    
3510     memcpy(pbits, cbits + posix_class_maps[posix_class],
3511     32 * sizeof(uschar));
3512    
3513     /* If there is a second table, add or remove it as required. */
3514    
3515     taboffset = posix_class_maps[posix_class + 1];
3516     tabopt = posix_class_maps[posix_class + 2];
3517    
3518     if (taboffset >= 0)
3519 nigel 77 {
3520 nigel 87 if (tabopt >= 0)
3521     for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
3522 nigel 77 else
3523 nigel 87 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
3524 nigel 77 }
3525    
3526 nigel 87 /* Now see if we need to remove any special characters. An option
3527     value of 1 removes vertical space and 2 removes underscore. */
3528    
3529     if (tabopt < 0) tabopt = -tabopt;
3530     if (tabopt == 1) pbits[1] &= ~0x3c;
3531     else if (tabopt == 2) pbits[11] &= 0x7f;
3532    
3533     /* Add the POSIX table or its complement into the main table that is
3534     being built and we are done. */
3535    
3536     if (local_negate)
3537     for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
3538     else
3539     for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
3540    
3541 nigel 77 ptr = tempptr + 1;
3542     class_charcount = 10; /* Set > 1; assumes more than 1 per class */
3543     continue; /* End of POSIX syntax handling */
3544     }
3545    
3546     /* Backslash may introduce a single character, or it may introduce one
3547 nigel 93 of the specials, which just set a flag. The sequence \b is a special
3548 ph10 513 case. Inside a class (and only there) it is treated as backspace. We
3549     assume that other escapes have more than one character in them, so set
3550     class_charcount bigger than one. Unrecognized escapes fall through and
3551     are either treated as literal characters (by default), or are faulted if
3552     PCRE_EXTRA is set. */
3553 nigel 77
3554 ph10 391 if (c == CHAR_BACKSLASH)
3555 nigel 77 {
3556 nigel 93 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3557     if (*errorcodeptr != 0) goto FAILED;
3558 nigel 77
3559 ph10 513 if (-c == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
3560 nigel 77 else if (-c == ESC_Q) /* Handle start of quoted string */
3561     {
3562 ph10 391 if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3563 nigel 77 {
3564     ptr += 2; /* avoid empty string */
3565     }
3566     else inescq = TRUE;
3567     continue;
3568     }
3569 ph10 220 else if (-c == ESC_E) continue; /* Ignore orphan \E */
3570 nigel 77
3571     if (c < 0)
3572     {
3573     register const uschar *cbits = cd->cbits;
3574     class_charcount += 2; /* Greater than 1 is what matters */
3575 nigel 93
3576 ph10 518 switch (-c)
3577 nigel 77 {
3578 ph10 518 #ifdef SUPPORT_UCP
3579     case ESC_du: /* These are the values given for \d etc */
3580     case ESC_DU: /* when PCRE_UCP is set. We replace the */
3581     case ESC_wu: /* escape sequence with an appropriate \p */
3582     case ESC_WU: /* or \P to test Unicode properties instead */
3583     case ESC_su: /* of the default ASCII testing. */
3584     case ESC_SU:
3585     nestptr = ptr;
3586     ptr = substitutes[-c - ESC_DU] - 1; /* Just before substitute */
3587 ph10 535 class_charcount -= 2; /* Undo! */
3588 ph10 518 continue;
3589     #endif
3590 nigel 77 case ESC_d:
3591     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
3592     continue;
3593    
3594     case ESC_D:
3595 ph10 286 should_flip_negation = TRUE;
3596 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
3597     continue;
3598    
3599     case ESC_w:
3600     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
3601     continue;
3602    
3603     case ESC_W:
3604 ph10 286 should_flip_negation = TRUE;
3605 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
3606     continue;
3607    
3608 ph10 552 /* Perl 5.004 onwards omits VT from \s, but we must preserve it
3609 ph10 579 if it was previously set by something earlier in the character
3610     class. */
3611 ph10 552
3612 nigel 77 case ESC_s:
3613 ph10 552 classbits[0] |= cbits[cbit_space];
3614 ph10 579 classbits[1] |= cbits[cbit_space+1] & ~0x08;
3615 ph10 552 for (c = 2; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
3616 nigel 77 continue;
3617    
3618     case ESC_S:
3619 ph10 286 should_flip_negation = TRUE;
3620 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
3621     classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
3622     continue;
3623    
3624 ph10 518 case ESC_h:
3625 ph10 178 SETBIT(classbits, 0x09); /* HT */
3626     SETBIT(classbits, 0x20); /* SPACE */
3627 ph10 180 SETBIT(classbits, 0xa0); /* NBSP */
3628 ph10 178 #ifdef SUPPORT_UTF8
3629     if (utf8)
3630 ph10 180 {
3631 ph10 178 class_utf8 = TRUE;
3632     *class_utf8data++ = XCL_SINGLE;
3633 ph10 180 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
3634 ph10 178 *class_utf8data++ = XCL_SINGLE;
3635 ph10 180 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
3636     *class_utf8data++ = XCL_RANGE;
3637     class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
3638     class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
3639 ph10 178 *class_utf8data++ = XCL_SINGLE;
3640 ph10 180 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
3641 ph10 178 *class_utf8data++ = XCL_SINGLE;
3642 ph10 180 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
3643 ph10 178 *class_utf8data++ = XCL_SINGLE;
3644 ph10 180 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
3645     }
3646     #endif
3647     continue;
3648 nigel 93
3649 ph10 518 case ESC_H:
3650 ph10 178 for (c = 0; c < 32; c++)
3651     {
3652     int x = 0xff;
3653     switch (c)
3654 ph10 180 {
3655 ph10 178 case 0x09/8: x ^= 1 << (0x09%8); break;
3656     case 0x20/8: x ^= 1 << (0x20%8); break;
3657     case 0xa0/8: x ^= 1 << (0xa0%8); break;
3658     default: break;
3659     }
3660     classbits[c] |= x;
3661 ph10 180 }
3662    
3663 ph10 178 #ifdef SUPPORT_UTF8
3664     if (utf8)
3665 ph10 180 {
3666 ph10 178 class_utf8 = TRUE;
3667 ph10 180 *class_utf8data++ = XCL_RANGE;
3668     class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3669     class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
3670     *class_utf8data++ = XCL_RANGE;
3671     class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
3672     class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
3673     *class_utf8data++ = XCL_RANGE;
3674     class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
3675     class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
3676     *class_utf8data++ = XCL_RANGE;
3677     class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
3678     class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
3679     *class_utf8data++ = XCL_RANGE;
3680     class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
3681     class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
3682     *class_utf8data++ = XCL_RANGE;
3683     class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
3684     class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
3685     *class_utf8data++ = XCL_RANGE;
3686     class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
3687     class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3688     }
3689     #endif
3690     continue;
3691 ph10 178
3692 ph10 518 case ESC_v:
3693 ph10 178 SETBIT(classbits, 0x0a); /* LF */
3694     SETBIT(classbits, 0x0b); /* VT */
3695 ph10 180 SETBIT(classbits, 0x0c); /* FF */
3696     SETBIT(classbits, 0x0d); /* CR */
3697     SETBIT(classbits, 0x85); /* NEL */
3698 ph10 178 #ifdef SUPPORT_UTF8
3699     if (utf8)
3700 ph10 180 {
3701 ph10 178 class_utf8 = TRUE;
3702 ph10 180 *class_utf8data++ = XCL_RANGE;
3703     class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
3704     class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3705     }
3706     #endif
3707     continue;
3708 ph10 178
3709 ph10 518 case ESC_V:
3710 ph10 178 for (c = 0; c < 32; c++)
3711     {
3712     int x = 0xff;
3713     switch (c)
3714 ph10 180 {
3715 ph10 178 case 0x0a/8: x ^= 1 << (0x0a%8);
3716     x ^= 1 << (0x0b%8);
3717     x ^= 1 << (0x0c%8);
3718 ph10 180 x ^= 1 << (0x0d%8);
3719 ph10 178 break;
3720     case 0x85/8: x ^= 1 << (0x85%8); break;
3721     default: break;
3722     }
3723     classbits[c] |= x;
3724 ph10 180 }
3725    
3726 ph10 178 #ifdef SUPPORT_UTF8
3727     if (utf8)
3728 ph10 180 {
3729 ph10 178 class_utf8 = TRUE;
3730 ph10 180 *class_utf8data++ = XCL_RANGE;
3731     class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3732     class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
3733     *class_utf8data++ = XCL_RANGE;
3734     class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3735     class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3736     }
3737     #endif
3738     continue;
3739 ph10 178
3740 nigel 77 #ifdef SUPPORT_UCP
3741 ph10 518 case ESC_p:
3742     case ESC_P:
3743     {
3744     BOOL negated;
3745     int pdata;
3746     int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
3747     if (ptype < 0) goto FAILED;
3748     class_utf8 = TRUE;
3749     *class_utf8data++ = ((-c == ESC_p) != negated)?
3750     XCL_PROP : XCL_NOTPROP;
3751     *class_utf8data++ = ptype;
3752     *class_utf8data++ = pdata;
3753     class_charcount -= 2; /* Not a < 256 character */
3754     continue;
3755     }
3756 nigel 77 #endif
3757 ph10 518 /* Unrecognized escapes are faulted if PCRE is running in its
3758     strict mode. By default, for compatibility with Perl, they are
3759     treated as literals. */
3760 nigel 77
3761 ph10 518 default:
3762     if ((options & PCRE_EXTRA) != 0)
3763     {
3764     *errorcodeptr = ERR7;
3765     goto FAILED;
3766     }
3767     class_charcount -= 2; /* Undo the default count from above */
3768     c = *ptr; /* Get the final character and fall through */
3769     break;
3770 nigel 93 }
3771 nigel 77 }
3772    
3773     /* Fall through if we have a single character (c >= 0). This may be
3774 nigel 93 greater than 256 in UTF-8 mode. */
3775 nigel 77
3776     } /* End of backslash handling */
3777    
3778     /* A single character may be followed by '-' to form a range. However,
3779     Perl does not permit ']' to be the end of the range. A '-' character
3780 nigel 93 at the end is treated as a literal. Perl ignores orphaned \E sequences
3781     entirely. The code for handling \Q and \E is messy. */
3782 nigel 77
3783 nigel 93 CHECK_RANGE:
3784 ph10 391 while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3785 nigel 77 {
3786 nigel 93 inescq = FALSE;
3787     ptr += 2;
3788     }
3789    
3790     oldptr = ptr;
3791 ph10 231
3792 ph10 230 /* Remember \r or \n */
3793 ph10 231
3794 ph10 391 if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
3795 ph10 231
3796 ph10 230 /* Check for range */
3797 nigel 93
3798 ph10 391 if (!inescq && ptr[1] == CHAR_MINUS)
3799 nigel 93 {
3800 nigel 77 int d;
3801     ptr += 2;
3802 ph10 391 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
3803 nigel 77
3804 nigel 93 /* If we hit \Q (not followed by \E) at this point, go into escaped
3805     mode. */
3806    
3807 ph10 391 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
3808 nigel 93 {
3809     ptr += 2;
3810 ph10 392 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
3811 ph10 391 { ptr += 2; continue; }
3812 nigel 93 inescq = TRUE;
3813     break;
3814     }
3815    
3816 ph10 391 if (*ptr == 0 || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
3817 nigel 93 {
3818     ptr = oldptr;
3819     goto LONE_SINGLE_CHARACTER;
3820     }
3821    
3822 nigel 77 #ifdef SUPPORT_UTF8
3823     if (utf8)
3824     { /* Braces are required because the */
3825     GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
3826     }
3827     else
3828     #endif
3829     d = *ptr; /* Not UTF-8 mode */
3830    
3831     /* The second part of a range can be a single-character escape, but
3832     not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3833     in such circumstances. */
3834    
3835 ph10 391 if (!inescq && d == CHAR_BACKSLASH)
3836 nigel 77 {
3837 nigel 93 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3838     if (*errorcodeptr != 0) goto FAILED;
3839 nigel 77
3840 ph10 514 /* \b is backspace; any other special means the '-' was literal */
3841 nigel 77
3842     if (d < 0)
3843     {
3844 ph10 514 if (d == -ESC_b) d = CHAR_BS; else
3845 nigel 77 {
3846 nigel 93 ptr = oldptr;
3847 nigel 77 goto LONE_SINGLE_CHARACTER; /* A few lines below */
3848     }
3849     }
3850     }
3851    
3852 nigel 93 /* Check that the two values are in the correct order. Optimize
3853     one-character ranges */
3854 nigel 77
3855 nigel 93 if (d < c)
3856     {
3857     *errorcodeptr = ERR8;
3858     goto FAILED;
3859     }
3860    
3861 nigel 77 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
3862    
3863 ph10 230 /* Remember \r or \n */
3864 ph10 231
3865 ph10 391 if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
3866 ph10 231
3867 nigel 77 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3868     matching, we have to use an XCLASS with extra data items. Caseless
3869     matching for characters > 127 is available only if UCP support is
3870     available. */
3871    
3872     #ifdef SUPPORT_UTF8
3873     if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
3874     {
3875     class_utf8 = TRUE;
3876    
3877     /* With UCP support, we can find the other case equivalents of
3878     the relevant characters. There may be several ranges. Optimize how
3879     they fit with the basic range. */
3880    
3881     #ifdef SUPPORT_UCP
3882     if ((options & PCRE_CASELESS) != 0)
3883     {
3884 nigel 93 unsigned int occ, ocd;
3885     unsigned int cc = c;
3886     unsigned int origd = d;
3887 nigel 77 while (get_othercase_range(&cc, origd, &occ, &ocd))
3888     {
3889 ph10 180 if (occ >= (unsigned int)c &&
3890     ocd <= (unsigned int)d)
3891 ph10 176 continue; /* Skip embedded ranges */
3892 nigel 77
3893 ph10 180 if (occ < (unsigned int)c &&
3894 ph10 176 ocd >= (unsigned int)c - 1) /* Extend the basic range */
3895 nigel 77 { /* if there is overlap, */
3896     c = occ; /* noting that if occ < c */
3897     continue; /* we can't have ocd > d */
3898     } /* because a subrange is */
3899 ph10 180 if (ocd > (unsigned int)d &&
3900 ph10 176 occ <= (unsigned int)d + 1) /* always shorter than */
3901 nigel 77 { /* the basic range. */
3902     d = ocd;
3903     continue;
3904     }
3905    
3906     if (occ == ocd)
3907     {
3908     *class_utf8data++ = XCL_SINGLE;
3909     }
3910     else
3911     {
3912     *class_utf8data++ = XCL_RANGE;
3913     class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3914     }
3915     class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3916     }
3917     }