/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory | Revision Log


Revision 760 - (hide annotations) (download)
Tue Nov 22 11:23:43 2011 UTC (18 months ago) by ph10
File MIME type: text/plain
File size: 251631 byte(s)
Test for workspace overflow with forward reference data.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 598 Copyright (c) 1997-2011 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains the external function pcre_compile(), along with
42     supporting internal functions that are not used by other modules. */
43    
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 236 #include "config.h"
47 ph10 200 #endif
48 ph10 199
49 nigel 93 #define NLBLOCK cd /* Block containing newline information */
50     #define PSSTART start_pattern /* Field containing processed string start */
51     #define PSEND end_pattern /* Field containing processed string end */
52    
53 nigel 77 #include "pcre_internal.h"
54    
55    
56 ph10 475 /* When PCRE_DEBUG is defined, we need the pcre_printint() function, which is
57     also used by pcretest. PCRE_DEBUG is not defined when building a production
58     library. */
59 nigel 85
60 ph10 475 #ifdef PCRE_DEBUG
61 nigel 85 #include "pcre_printint.src"
62     #endif
63    
64    
/* Macro for setting individual bits in class bitmaps: sets bit number b in the
byte array a (eight bits per byte, least significant bit first). Both arguments
are fully parenthesized so that an expression argument such as
SETBIT(map, c + 1) addresses the intended byte and bit. Note that b is
evaluated twice, so it must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
68    
69 ph10 202 /* Maximum length value to check against when making sure that the integer that
70     holds the compiled pattern length does not overflow. We make it a bit less than
71     INT_MAX to allow for adding in group terminating bytes, so that we don't have
72     to check them every time. */
73 ph10 178
74 ph10 202 #define OFLOW_MAX (INT_MAX - 20)
75    
76    
77 nigel 77 /*************************************************
78     * Code parameters and static tables *
79     *************************************************/
80    
81 nigel 93 /* This value specifies the size of stack workspace that is used during the
82     first pre-compile phase that determines how much memory is required. The regex
83     is partly compiled into this space, but the compiled parts are discarded as
84     soon as they can be, so that hopefully there will never be an overrun. The code
85     does, however, check for an overrun. The largest amount I've seen used is 218,
86     so this number is very generous.
87 nigel 77
88 nigel 93 The same workspace is used during the second, actual compile phase for
89     remembering forward references to groups so that they can be filled in at the
90     end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
91     is 4 there is plenty of room. */
92 nigel 77
93 nigel 93 #define COMPILE_WORK_SIZE (4096)
94 nigel 77
95 ph10 507 /* The overrun tests check for a slightly smaller size so that they detect the
96 ph10 505 overrun before it actually does run off the end of the data block. */
97 nigel 93
98 ph10 505 #define WORK_SIZE_CHECK (COMPILE_WORK_SIZE - 100)
99    
100    
101 nigel 77 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
102     are simple data values; negative values are for special things like \d and so
103     on. Zero means further processing is needed (for things like \x), or the escape
104     is invalid. */
105    
106 ph10 391 #ifndef EBCDIC
107    
108     /* This is the "normal" table for ASCII systems or for EBCDIC systems running
109 ph10 392 in UTF-8 mode. */
110 ph10 391
/* Indexed by (c - CHAR_0) for characters in the range '0' to 'z'. The trailing
comment on each row names the character(s) that the row's entries stand for. */
111 ph10 392 static const short int escapes[] = {
112 ph10 391 0, 0, /* 0 1 */
113     0, 0, /* 2 3 */
114 ph10 392 0, 0, /* 4 5 */
115     0, 0, /* 6 7 */
116     0, 0, /* 8 9 */
117 ph10 391 CHAR_COLON, CHAR_SEMICOLON, /* : ; */
118 ph10 392 CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN, /* < = */
119 ph10 391 CHAR_GREATER_THAN_SIGN, CHAR_QUESTION_MARK, /* > ? */
120 ph10 392 CHAR_COMMERCIAL_AT, -ESC_A, /* @ A */
121     -ESC_B, -ESC_C, /* B C */
122     -ESC_D, -ESC_E, /* D E */
123     0, -ESC_G, /* F G */
124     -ESC_H, 0, /* H I */
125     0, -ESC_K, /* J K */
126 ph10 391 0, 0, /* L M */
127 ph10 514 -ESC_N, 0, /* N O */
128 ph10 391 -ESC_P, -ESC_Q, /* P Q */
129     -ESC_R, -ESC_S, /* R S */
130 ph10 392 0, 0, /* T U */
131     -ESC_V, -ESC_W, /* V W */
132     -ESC_X, 0, /* X Y */
133     -ESC_Z, CHAR_LEFT_SQUARE_BRACKET, /* Z [ */
134 ph10 391 CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET, /* backslash ] */
135 ph10 392 CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE, /* ^ _ */
136 ph10 391 CHAR_GRAVE_ACCENT, 7, /* ` a (literal value 7) */
137 ph10 392 -ESC_b, 0, /* b c */
138     -ESC_d, ESC_e, /* d e */
139 ph10 391 ESC_f, 0, /* f g */
140     -ESC_h, 0, /* h i */
141 ph10 392 0, -ESC_k, /* j k */
142 ph10 391 0, 0, /* l m */
143     ESC_n, 0, /* n o */
144 ph10 392 -ESC_p, 0, /* p q */
145     ESC_r, -ESC_s, /* r s */
146 ph10 391 ESC_tee, 0, /* t u */
147 ph10 392 -ESC_v, -ESC_w, /* v w */
148     0, 0, /* x y */
149 ph10 391 -ESC_z /* z */
150 nigel 77 };
151    
152 ph10 392 #else
153 ph10 391
154     /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
155    
/* Indexed by (c - 0x48). Each row holds eight entries, and the comment at the
start of a row gives the hexadecimal character code of that row's first entry. */
156 nigel 77 static const short int escapes[] = {
157     /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
158     /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
159     /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
160     /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
161     /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
162     /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
163     /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
164     /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
165 ph10 178 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
166 nigel 93 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
167 nigel 77 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
168 ph10 178 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
169 nigel 77 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
170     /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
171     /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
172     /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
173 ph10 178 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
174 ph10 514 /* D0 */ '}', 0, -ESC_K, 0, 0,-ESC_N, 0, -ESC_P,
175 nigel 93 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
176 ph10 178 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
177 nigel 77 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
178     /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
179     /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
180     };
181     #endif
182    
183    
184 ph10 243 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
185     searched linearly. Put all the names into a single string, in order to reduce
186 ph10 392 the number of relocations when a shared library is dynamically linked. The
187     string is built from string macros so that it works in UTF-8 mode on EBCDIC
188 ph10 391 platforms. */
189 ph10 210
/* One entry of the verbs[] table, describing a single (*VERB) name. */
190     typedef struct verbitem {
191 ph10 510 int len; /* Length of verb name */
192     int op; /* Op when no arg, or -1 if arg mandatory */
193     int op_arg; /* Op when arg present, or -1 if not allowed */
194 ph10 211 } verbitem;
195 ph10 210
/* All verb names packed into one string, in the same order as the entries of
verbs[]. The STRING_xxx0 macros presumably expand to the name followed by a
zero byte - confirm in pcre_internal.h. */
196 ph10 240 static const char verbnames[] =
197 ph10 510 "\0" /* Empty name is a shorthand for MARK */
198 ph10 512 STRING_MARK0
199 ph10 391 STRING_ACCEPT0
200     STRING_COMMIT0
201     STRING_F0
202     STRING_FAIL0
203     STRING_PRUNE0
204     STRING_SKIP0
205     STRING_THEN;
206 ph10 240
/* One verbitem per name in verbnames, in the same order: { name length, opcode
without argument, opcode with argument }. A value of -1 marks the form that is
not available for that verb. */
207 ph10 327 static const verbitem verbs[] = {
208 ph10 510 { 0, -1, OP_MARK }, /* empty name: shorthand for MARK */
209 ph10 512 { 4, -1, OP_MARK }, /* MARK */
210 ph10 510 { 6, OP_ACCEPT, -1 }, /* ACCEPT */
211     { 6, OP_COMMIT, -1 }, /* COMMIT */
212     { 1, OP_FAIL, -1 }, /* F */
213     { 4, OP_FAIL, -1 }, /* FAIL */
214     { 5, OP_PRUNE, OP_PRUNE_ARG }, /* PRUNE */
215     { 4, OP_SKIP, OP_SKIP_ARG }, /* SKIP */
216     { 4, OP_THEN, OP_THEN_ARG } /* THEN */
217 ph10 210 };
218    
/* Number of entries in verbs[]. */
219 ph10 327 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
220 ph10 210
221    
222 ph10 243 /* Tables of names of POSIX character classes and their lengths. The names are
223     now all in a single string, to reduce the number of relocations when a shared
224 ph10 240 library is dynamically loaded. The list of lengths is terminated by a zero
225     length entry. The first three must be alpha, lower, upper, as this is assumed
226     for handling case independence. */
227 nigel 77
/* POSIX class names packed into one string. The order here is the order used by
posix_name_lengths and by the row comments of posix_class_maps. The STRING_xxx0
macros presumably append a zero byte after each name - confirm in
pcre_internal.h. */
228 ph10 240 static const char posix_names[] =
229 ph10 392 STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
230     STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
231 ph10 391 STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
232     STRING_word0 STRING_xdigit;
233 nigel 77
/* Length of each name in posix_names, in the same order: twelve 5-character
names (alpha ... space), then word (4) and xdigit (6). The final 0 terminates
the list. */
234     static const uschar posix_name_lengths[] = {
235     5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
236    
237 nigel 87 /* Table of class bit maps for each POSIX class. Each class is formed from a
238     base map, with an optional addition or removal of another map. Then, for some
239     classes, there is some additional tweaking: for [:blank:] the vertical space
240     characters are removed, and for [:alpha:] and [:alnum:] the underscore
241     character is removed. The triples in the table consist of the base map offset,
242     second map offset or -1 if no second map, and a non-negative value for map
243     addition or a negative value for map subtraction (if there are two maps). The
244     absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
245     remove vertical space characters, 2 => remove underscore. */
246 nigel 77
/* Three ints per POSIX class, one row per class in the order of posix_names:
base map offset, second map offset (or -1), and the add/subtract-and-tweak
code. */
247     static const int posix_class_maps[] = {
248 nigel 87 cbit_word, cbit_digit, -2, /* alpha */
249     cbit_lower, -1, 0, /* lower */
250     cbit_upper, -1, 0, /* upper */
251     cbit_word, -1, 2, /* alnum - word without underscore */
252     cbit_print, cbit_cntrl, 0, /* ascii */
253     cbit_space, -1, 1, /* blank - a GNU extension */
254     cbit_cntrl, -1, 0, /* cntrl */
255     cbit_digit, -1, 0, /* digit */
256     cbit_graph, -1, 0, /* graph */
257     cbit_print, -1, 0, /* print */
258     cbit_punct, -1, 0, /* punct */
259     cbit_space, -1, 0, /* space */
260     cbit_word, -1, 0, /* word - a Perl extension */
261     cbit_xdigit,-1, 0 /* xdigit */
262 nigel 77 };
263    
264 ph10 535 /* Table of substitutes for \d etc when PCRE_UCP is set. The POSIX class
265     substitutes must be in the order of the names, defined above, and there are
266 ph10 518 both positive and negative cases. NULL means no substitute. */
267 nigel 77
268 ph10 518 #ifdef SUPPORT_UCP
/* Replacement pattern text for the escapes \D \d \S \s \W \w, in that order.
NOTE(review): presumably indexed by the escape code relative to ESC_D - confirm
at the point of use. */
269     static const uschar *substitutes[] = {
270     (uschar *)"\\P{Nd}", /* \D */
271     (uschar *)"\\p{Nd}", /* \d */
272     (uschar *)"\\P{Xsp}", /* \S */ /* NOTE: Xsp is Perl space */
273     (uschar *)"\\p{Xsp}", /* \s */
274     (uschar *)"\\P{Xwd}", /* \W */
275 ph10 535 (uschar *)"\\p{Xwd}" /* \w */
276 ph10 518 };
277 ph10 535
/* Replacement pattern text for POSIX classes. The first fourteen entries follow
the order of posix_names; the second fourteen are the negated forms in the same
order. NULL means the class has no substitute. */
278 ph10 518 static const uschar *posix_substitutes[] = {
279     (uschar *)"\\p{L}", /* alpha */
280 ph10 535 (uschar *)"\\p{Ll}", /* lower */
281     (uschar *)"\\p{Lu}", /* upper */
282     (uschar *)"\\p{Xan}", /* alnum */
283 ph10 518 NULL, /* ascii */
284     (uschar *)"\\h", /* blank */
285     NULL, /* cntrl */
286     (uschar *)"\\p{Nd}", /* digit */
287     NULL, /* graph */
288     NULL, /* print */
289     NULL, /* punct */
290     (uschar *)"\\p{Xps}", /* space */ /* NOTE: Xps is POSIX space */
291     (uschar *)"\\p{Xwd}", /* word */
292 ph10 535 NULL, /* xdigit */
293 ph10 518 /* Negated cases */
294     (uschar *)"\\P{L}", /* ^alpha */
295 ph10 535 (uschar *)"\\P{Ll}", /* ^lower */
296     (uschar *)"\\P{Lu}", /* ^upper */
297     (uschar *)"\\P{Xan}", /* ^alnum */
298 ph10 518 NULL, /* ^ascii */
299     (uschar *)"\\H", /* ^blank */
300     NULL, /* ^cntrl */
301     (uschar *)"\\P{Nd}", /* ^digit */
302     NULL, /* ^graph */
303     NULL, /* ^print */
304     NULL, /* ^punct */
305     (uschar *)"\\P{Xps}", /* ^space */ /* NOTE: Xps is POSIX space */
306     (uschar *)"\\P{Xwd}", /* ^word */
307 ph10 535 NULL /* ^xdigit */
308 ph10 518 };
/* Total number of entries in posix_substitutes (positive plus negated). */
309     #define POSIX_SUBSIZE (sizeof(posix_substitutes)/sizeof(uschar *))
310 ph10 535 #endif
311 ph10 518
312 nigel 93 #define STRING(a) # a
313     #define XSTRING(s) STRING(s)
314    
315 nigel 77 /* The texts of compile-time error messages. These are "char *" because they
316 nigel 93 are passed to the outside world. Do not ever re-use any error number, because
317     they are documented. Always add a new error instead. Messages marked DEAD below
318 ph10 243 are no longer used. This used to be a table of strings, but in order to reduce
319     the number of relocations needed when a shared library is loaded dynamically,
320     it is now one long string. We cannot use a table of offsets, because the
321     lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
322     simply count through to the one we want - this isn't a performance issue
323 ph10 507 because these strings are used only when there is a compilation error.
324 nigel 77
325 ph10 507 Each substring ends with \0 to insert a null character. This includes the final
326     substring, so that the whole string ends with \0\0, which can be detected when
327 ph10 499 counting through. */
328    
/* Message n is the n-th zero-terminated substring, counting from 0, and
find_error_text() reaches it by stepping over terminators. The numbered comments
mark every fifth message. New messages must be appended at the end so that
existing error numbers never change. */
329 ph10 240 static const char error_texts[] =
330     "no error\0"
331     "\\ at end of pattern\0"
332     "\\c at end of pattern\0"
333     "unrecognized character follows \\\0"
334     "numbers out of order in {} quantifier\0"
335 nigel 77 /* 5 */
336 ph10 240 "number too big in {} quantifier\0"
337     "missing terminating ] for character class\0"
338     "invalid escape sequence in character class\0"
339     "range out of order in character class\0"
340     "nothing to repeat\0"
341 nigel 77 /* 10 */
342 ph10 240 "operand of unlimited repeat could match the empty string\0" /** DEAD **/
343     "internal error: unexpected repeat\0"
344 ph10 269 "unrecognized character after (? or (?-\0"
345 ph10 240 "POSIX named classes are supported only within a class\0"
346     "missing )\0"
347 nigel 77 /* 15 */
348 ph10 240 "reference to non-existent subpattern\0"
349     "erroffset passed as NULL\0"
350     "unknown option bit(s) set\0"
351     "missing ) after comment\0"
352     "parentheses nested too deeply\0" /** DEAD **/
353 nigel 77 /* 20 */
354 ph10 240 "regular expression is too large\0"
355     "failed to get memory\0"
356     "unmatched parentheses\0"
357     "internal error: code overflow\0"
358     "unrecognized character after (?<\0"
359 nigel 77 /* 25 */
360 ph10 240 "lookbehind assertion is not fixed length\0"
361     "malformed number or name after (?(\0"
362     "conditional group contains more than two branches\0"
363     "assertion expected after (?(\0"
364     "(?R or (?[+-]digits must be followed by )\0"
365 nigel 77 /* 30 */
366 ph10 240 "unknown POSIX class name\0"
367     "POSIX collating elements are not supported\0"
368     "this version of PCRE is not compiled with PCRE_UTF8 support\0"
369     "spare error\0" /** DEAD **/
370     "character value in \\x{...} sequence is too large\0"
371 nigel 77 /* 35 */
372 ph10 240 "invalid condition (?(0)\0"
373     "\\C not allowed in lookbehind assertion\0"
374 ph10 514 "PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0"
375 ph10 240 "number after (?C is > 255\0"
376     "closing ) for (?C expected\0"
377 nigel 77 /* 40 */
378 ph10 240 "recursive call could loop indefinitely\0"
379     "unrecognized character after (?P\0"
380     "syntax error in subpattern name (missing terminator)\0"
381     "two named subpatterns have the same name\0"
382     "invalid UTF-8 string\0"
383 nigel 77 /* 45 */
384 ph10 240 "support for \\P, \\p, and \\X has not been compiled\0"
385     "malformed \\P or \\p sequence\0"
386     "unknown property name after \\P or \\p\0"
387     "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
388     "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
389 nigel 91 /* 50 */
390 ph10 240 "repeated subpattern is too long\0" /** DEAD **/
391     "octal value is greater than \\377 (not in UTF-8 mode)\0"
392     "internal error: overran compiling workspace\0"
393     "internal error: previously-checked referenced subpattern not found\0"
394     "DEFINE group contains more than one branch\0"
395 nigel 93 /* 55 */
396 ph10 637 "repeating a DEFINE group is not allowed\0" /** DEAD **/
397 ph10 240 "inconsistent NEWLINE options\0"
398 ph10 333 "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
399     "a numbered reference must not be zero\0"
400 ph10 510 "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0"
401 ph10 211 /* 60 */
402 ph10 240 "(*VERB) not recognized\0"
403 ph10 268 "number is too big\0"
404 ph10 272 "subpattern name expected\0"
405 ph10 336 "digit expected after (?+\0"
406 ph10 457 "] is an invalid data character in JavaScript compatibility mode\0"
407     /* 65 */
408 ph10 510 "different names for subpatterns of the same number are not allowed\0"
409 ph10 512 "(*MARK) must have an argument\0"
410 ph10 535 "this version of PCRE is not compiled with PCRE_UCP support\0"
411 ph10 579 "\\c must be followed by an ASCII character\0"
412 ph10 654 "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
413 ph10 747 /* 70 */
414     "internal error: unknown opcode in find_fixedlength()\0"
415 ph10 758 "\\N is not supported in a class\0"
416 ph10 760 "too many forward references\0"
417 ph10 510 ;
418 nigel 77
419     /* Table to identify digits and hex digits. This is used when compiling
420     patterns. Note that the tables in chartables are dependent on the locale, and
421     may mark arbitrary characters as digits - but the PCRE compiling code expects
422     to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
423     a private table here. It costs 256 bytes, but it is a lot faster than doing
424     character value tests (at least in some simple cases I timed), and in some
425     applications one wants PCRE to compile efficiently as well as match
426     efficiently.
427    
428     For convenience, we use the same bit definitions as in chartables:
429    
430     0x04 decimal digit
431     0x08 hexadecimal digit
432    
433     Then we can use ctype_digit and ctype_xdigit in the code. */
434    
435 ph10 392 #ifndef EBCDIC
436 ph10 391
437 ph10 392 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
438 ph10 391 UTF-8 mode. */
439    
/* 256-entry table indexed by character code. Entries for '0'-'9' are 0x0c
(0x04 decimal digit plus 0x08 hexadecimal digit), entries for 'A'-'F' and
'a'-'f' are 0x08, and every other entry is zero. */
440 nigel 77 static const unsigned char digitab[] =
441     {
442     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
443     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
444     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
445     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
446     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* space - ' */
447     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
448     0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
449     0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
450     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
451     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
452     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
453     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
454     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
455     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
456     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
457     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
458     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
459     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
460     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
461     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
462     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
463     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
464     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
465     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
466     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
467     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
468     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
469     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
470     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
471     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
472     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
473     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
474    
475 ph10 392 #else
476 ph10 391
477     /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
478    
/* 256-entry table indexed by EBCDIC character code. Entries 0xF0-0xF9 (the
digits) are 0x0c (0x04 decimal digit plus 0x08 hexadecimal digit), entries
0x81-0x86 and 0xC1-0xC6 (a-f and A-F) are 0x08, and every other entry is zero.
The second column of each row comment, where present, is the row's starting
code in hexadecimal. */
479 nigel 77 static const unsigned char digitab[] =
480     {
481     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
482     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
483     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
484     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
485     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
486     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
487     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
488     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
489     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
490     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
491     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
492 ph10 97 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
493 nigel 77 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
494     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
495     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
496     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
497     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
498     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
499     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
500     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
501     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
502     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
503     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
504     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
505     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
506     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
507     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
508     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
509     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
510     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
511     0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
512     0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
513    
/* 256-entry character-type table indexed by EBCDIC character code.
check_escape() treats a character as alphanumeric when (entry & 0x0E) is
non-zero. NOTE(review): the individual bit meanings presumably follow the
ctype_xxx definitions used by the main character tables - confirm in
pcre_internal.h. */
514     static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
515     0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
516     0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
517     0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
518     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
519     0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
520     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
521     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
522     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
523     0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
524     0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
525     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
526 ph10 97 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
527 nigel 77 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
528     0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
529     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
530     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
531     0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
532     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
533     0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
534     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
535     0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
536     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
537     0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
538     0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
539     0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
540     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
541     0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
542     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
543     0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
544     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
545     0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
546     0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
547     #endif
548    
549    
550     /* Definition to allow mutual recursion */
551    
552     static BOOL
553 ph10 642 compile_regex(int, uschar **, const uschar **, int *, BOOL, BOOL, int, int,
554     int *, int *, branch_chain *, compile_data *, int *);
555 nigel 77
556    
557    
558     /*************************************************
559 ph10 240 * Find an error text *
560     *************************************************/
561    
562 ph10 243 /* The error texts are now all in one long string, to save on relocations. As
563     some of the text is of unknown length, we can't use a table of offsets.
564     Instead, just count through the strings. This is not a performance issue
565 ph10 240 because it happens only when there has been a compilation error.
566    
567     Argument: the error number
568     Returns: pointer to the error string
569     */
570    
571     static const char *
572     find_error_text(int n)
573     {
574     const char *s = error_texts;
575 ph10 507 for (; n > 0; n--)
576 ph10 499 {
577     while (*s++ != 0) {};
578     if (*s == 0) return "Error text not found (please report)";
579 ph10 507 }
580 ph10 240 return s;
581     }
582    
583    
584     /*************************************************
585 ph10 640 * Check for counted repeat *
586     *************************************************/
587    
588     /* This function is called when a '{' is encountered in a place where it might
589     start a quantifier. It looks ahead to see if it really is a quantifier or not.
590     It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
591     where the ddds are digits.
592    
593     Arguments:
594     p pointer to the first char after '{'
595    
596     Returns: TRUE or FALSE
597     */
598    
599     static BOOL
600     is_counted_repeat(const uschar *p)
601     {
602     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
603     while ((digitab[*p] & ctype_digit) != 0) p++;
604     if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
605    
606     if (*p++ != CHAR_COMMA) return FALSE;
607     if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
608    
609     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
610     while ((digitab[*p] & ctype_digit) != 0) p++;
611    
612     return (*p == CHAR_RIGHT_CURLY_BRACKET);
613     }
614    
615    
616    
617     /*************************************************
618 nigel 77 * Handle escapes *
619     *************************************************/
620    
621     /* This function is called when a \ has been encountered. It either returns a
622     positive value for a simple escape such as \n, or a negative value which
623 nigel 93 encodes one of the more complicated things such as \d. A backreference to group
624     n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
625     UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
626     ptr is pointing at the \. On exit, it is on the final character of the escape
627     sequence.
628 nigel 77
629     Arguments:
630     ptrptr points to the pattern position pointer
631     errorcodeptr points to the errorcode variable
632     bracount number of previous extracting brackets
633     options the options bits
634     isclass TRUE if inside a character class
635    
636     Returns: zero or positive => a data character
637     negative => a special escape sequence
638 ph10 213 on error, errorcodeptr is set
639 nigel 77 */
640    
641     static int
642     check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
643     int options, BOOL isclass)
644     {
645 nigel 87 BOOL utf8 = (options & PCRE_UTF8) != 0;
646     const uschar *ptr = *ptrptr + 1;
647 nigel 77 int c, i;
648    
649 nigel 87 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
650     ptr--; /* Set pointer back to the last byte */
651    
652 nigel 77 /* If backslash is at the end of the pattern, it's an error. */
653    
654     if (c == 0) *errorcodeptr = ERR1;
655    
656 ph10 274 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
657     in a table. A non-zero result is something that can be returned immediately.
658 nigel 77 Otherwise further processing may be required. */
659    
660 ph10 391 #ifndef EBCDIC /* ASCII/UTF-8 coding */
661     else if (c < CHAR_0 || c > CHAR_z) {} /* Not alphanumeric */
662     else if ((i = escapes[c - CHAR_0]) != 0) c = i;
663 nigel 77
664 ph10 97 #else /* EBCDIC coding */
665 ph10 274 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphanumeric */
666 nigel 77 else if ((i = escapes[c - 0x48]) != 0) c = i;
667     #endif
668    
669     /* Escapes that need further processing, or are illegal. */
670    
671     else
672     {
673     const uschar *oldptr;
674 nigel 93 BOOL braced, negated;
675    
676 nigel 77 switch (c)
677     {
678     /* A number of Perl escapes are not handled by PCRE. We give an explicit
679     error. */
680    
681 ph10 391 case CHAR_l:
682     case CHAR_L:
683 zherczeg 744 *errorcodeptr = ERR37;
684     break;
685    
686 ph10 391 case CHAR_u:
687 zherczeg 744 if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
688     {
689     /* In JavaScript, \u must be followed by four hexadecimal numbers.
690     Otherwise it is a lowercase u letter. */
691     if ((digitab[ptr[1]] & ctype_xdigit) != 0 && (digitab[ptr[2]] & ctype_xdigit) != 0
692     && (digitab[ptr[3]] & ctype_xdigit) != 0 && (digitab[ptr[4]] & ctype_xdigit) != 0)
693     {
694     c = 0;
695     for (i = 0; i < 4; ++i)
696     {
697     register int cc = *(++ptr);
698     #ifndef EBCDIC /* ASCII/UTF-8 coding */
699     if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
700     c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
701     #else /* EBCDIC coding */
702     if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
703     c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
704     #endif
705     }
706     }
707     }
708     else
709     *errorcodeptr = ERR37;
710     break;
711    
712 ph10 391 case CHAR_U:
713 zherczeg 744 /* In JavaScript, \U is an uppercase U letter. */
714     if ((options & PCRE_JAVASCRIPT_COMPAT) == 0) *errorcodeptr = ERR37;
715 nigel 77 break;
716    
717 ph10 654 /* In a character class, \g is just a literal "g". Outside a character
718 ph10 640 class, \g must be followed by one of a number of specific things:
719 ph10 345
720 ph10 333 (1) A number, either plain or braced. If positive, it is an absolute
721     backreference. If negative, it is a relative backreference. This is a Perl
722     5.10 feature.
723 ph10 345
724 ph10 333 (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
725     is part of Perl's movement towards a unified syntax for back references. As
726     this is synonymous with \k{name}, we fudge it up by pretending it really
727     was \k.
728 ph10 345
729     (3) For Oniguruma compatibility we also support \g followed by a name or a
730     number either in angle brackets or in single quotes. However, these are
731     (possibly recursive) subroutine calls, _not_ backreferences. Just return
732 ph10 333 the -ESC_g code (cf \k). */
733 nigel 93
734 ph10 391 case CHAR_g:
735 ph10 640 if (isclass) break;
736 ph10 391 if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
737 ph10 333 {
738     c = -ESC_g;
739 ph10 345 break;
740     }
741 ph10 333
742     /* Handle the Perl-compatible cases */
743 ph10 345
744 ph10 391 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
745 nigel 93 {
746 ph10 171 const uschar *p;
747 ph10 391 for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
748     if (*p != CHAR_MINUS && (digitab[*p] & ctype_digit) == 0) break;
749     if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)
750 ph10 171 {
751     c = -ESC_k;
752     break;
753 ph10 172 }
754 nigel 93 braced = TRUE;
755     ptr++;
756     }
757     else braced = FALSE;
758    
759 ph10 391 if (ptr[1] == CHAR_MINUS)
760 nigel 93 {
761     negated = TRUE;
762     ptr++;
763     }
764     else negated = FALSE;
765    
766     c = 0;
767     while ((digitab[ptr[1]] & ctype_digit) != 0)
768 ph10 391 c = c * 10 + *(++ptr) - CHAR_0;
769 ph10 220
770 ph10 333 if (c < 0) /* Integer overflow */
771 ph10 213 {
772     *errorcodeptr = ERR61;
773     break;
774 ph10 220 }
775 ph10 345
776 ph10 391 if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
777 nigel 93 {
778     *errorcodeptr = ERR57;
779 ph10 213 break;
780 nigel 93 }
781 ph10 345
782 ph10 333 if (c == 0)
783     {
784     *errorcodeptr = ERR58;
785     break;
786 ph10 345 }
787 nigel 93
788     if (negated)
789     {
790     if (c > bracount)
791     {
792     *errorcodeptr = ERR15;
793 ph10 213 break;
794 nigel 93 }
795     c = bracount - (c - 1);
796     }
797    
798     c = -(ESC_REF + c);
799     break;
800    
801 nigel 77 /* The handling of escape sequences consisting of a string of digits
802     starting with one that is not zero is not straightforward. By experiment,
803     the way Perl works seems to be as follows:
804    
805     Outside a character class, the digits are read as a decimal number. If the
806     number is less than 10, or if there are that many previous extracting
807     left brackets, then it is a back reference. Otherwise, up to three octal
808     digits are read to form an escaped byte. Thus \123 is likely to be octal
809     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
810     value is greater than 377, the least significant 8 bits are taken. Inside a
811     character class, \ followed by a digit is always an octal number. */
812    
813 ph10 391 case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
814     case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
815 nigel 77
816     if (!isclass)
817     {
818     oldptr = ptr;
819 ph10 391 c -= CHAR_0;
820 nigel 77 while ((digitab[ptr[1]] & ctype_digit) != 0)
821 ph10 391 c = c * 10 + *(++ptr) - CHAR_0;
822 ph10 333 if (c < 0) /* Integer overflow */
823 ph10 213 {
824     *errorcodeptr = ERR61;
825 ph10 220 break;
826     }
827 nigel 77 if (c < 10 || c <= bracount)
828     {
829     c = -(ESC_REF + c);
830     break;
831     }
832     ptr = oldptr; /* Put the pointer back and fall through */
833     }
834    
835     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
836     generates a binary zero byte and treats the digit as a following literal.
837     Thus we have to pull back the pointer by one. */
838    
839 ph10 391 if ((c = *ptr) >= CHAR_8)
840 nigel 77 {
841     ptr--;
842     c = 0;
843     break;
844     }
845    
846     /* \0 always starts an octal number, but we may drop through to here with a
847 nigel 91 larger first octal digit. The original code used just to take the least
848     significant 8 bits of octal numbers (I think this is what early Perls used
849     to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
850     than 3 octal digits. */
851 nigel 77
852 ph10 391 case CHAR_0:
853     c -= CHAR_0;
854     while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
855     c = c * 8 + *(++ptr) - CHAR_0;
856 nigel 91 if (!utf8 && c > 255) *errorcodeptr = ERR51;
857 nigel 77 break;
858    
859 nigel 87 /* \x is complicated. \x{ddd} is a character number which can be greater
860     than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
861     treated as a data character. */
862 nigel 77
863 ph10 391 case CHAR_x:
864 zherczeg 744 if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
865     {
866     /* In JavaScript, \x must be followed by two hexadecimal numbers.
867     Otherwise it is a lowercase x letter. */
868     if ((digitab[ptr[1]] & ctype_xdigit) != 0 && (digitab[ptr[2]] & ctype_xdigit) != 0)
869     {
870     c = 0;
871     for (i = 0; i < 2; ++i)
872     {
873     register int cc = *(++ptr);
874     #ifndef EBCDIC /* ASCII/UTF-8 coding */
875     if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
876     c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
877     #else /* EBCDIC coding */
878     if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
879     c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
880     #endif
881     }
882     }
883     break;
884     }
885    
886 ph10 391 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
887 nigel 77 {
888     const uschar *pt = ptr + 2;
889 nigel 87 int count = 0;
890    
891 nigel 77 c = 0;
892     while ((digitab[*pt] & ctype_xdigit) != 0)
893     {
894 nigel 87 register int cc = *pt++;
895 ph10 391 if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
896 nigel 77 count++;
897 nigel 87
898 ph10 391 #ifndef EBCDIC /* ASCII/UTF-8 coding */
899     if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
900     c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
901 ph10 97 #else /* EBCDIC coding */
902 ph10 391 if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
903     c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
904 nigel 77 #endif
905     }
906 nigel 87
907 ph10 391 if (*pt == CHAR_RIGHT_CURLY_BRACKET)
908 nigel 77 {
909 nigel 87 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
910 nigel 77 ptr = pt;
911     break;
912     }
913 nigel 87
914 nigel 77 /* If the sequence of hex digits does not end with '}', then we don't
915     recognize this construct; fall through to the normal \x handling. */
916     }
917    
918 nigel 87 /* Read just a single-byte hex-defined char */
919 nigel 77
920     c = 0;
921     while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
922     {
923 ph10 391 int cc; /* Some compilers don't like */
924     cc = *(++ptr); /* ++ in initializers */
925     #ifndef EBCDIC /* ASCII/UTF-8 coding */
926     if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
927     c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
928 ph10 97 #else /* EBCDIC coding */
929 ph10 391 if (cc <= CHAR_z) cc += 64; /* Convert to upper case */
930     c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
931 nigel 77 #endif
932     }
933     break;
934    
935 nigel 93 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
936 ph10 574 An error is given if the byte following \c is not an ASCII character. This
937     coding is ASCII-specific, but then the whole concept of \cx is
938 nigel 93 ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
939 nigel 77
940 ph10 391 case CHAR_c:
941 nigel 77 c = *(++ptr);
942     if (c == 0)
943     {
944     *errorcodeptr = ERR2;
945 ph10 213 break;
946 nigel 77 }
947 ph10 574 #ifndef EBCDIC /* ASCII/UTF-8 coding */
948     if (c > 127) /* Excludes all non-ASCII in either mode */
949     {
950     *errorcodeptr = ERR68;
951 ph10 579 break;
952     }
953 ph10 391 if (c >= CHAR_a && c <= CHAR_z) c -= 32;
954 nigel 77 c ^= 0x40;
955 ph10 574 #else /* EBCDIC coding */
956 ph10 391 if (c >= CHAR_a && c <= CHAR_z) c += 64;
957 nigel 77 c ^= 0xC0;
958     #endif
959     break;
960    
961     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
962 ph10 274 other alphanumeric following \ is an error if PCRE_EXTRA was set;
963     otherwise, for Perl compatibility, it is a literal. This code looks a bit
964     odd, but there used to be some cases other than the default, and there may
965     be again in future, so I haven't "optimized" it. */
966 nigel 77
967     default:
968     if ((options & PCRE_EXTRA) != 0) switch(c)
969     {
970     default:
971     *errorcodeptr = ERR3;
972     break;
973     }
974     break;
975     }
976     }
977 ph10 518
978     /* Perl supports \N{name} for character names, as well as plain \N for "not
979 ph10 654 newline". PCRE does not support \N{name}. However, it does support
980 ph10 640 quantification such as \N{2,3}. */
981 nigel 77
982 ph10 640 if (c == -ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET &&
983     !is_counted_repeat(ptr+2))
984 ph10 518 *errorcodeptr = ERR37;
985 ph10 514
986 ph10 518 /* If PCRE_UCP is set, we change the values for \d etc. */
987    
988     if ((options & PCRE_UCP) != 0 && c <= -ESC_D && c >= -ESC_w)
989     c -= (ESC_DU - ESC_D);
990    
991     /* Set the pointer to the final character before returning. */
992    
993 nigel 77 *ptrptr = ptr;
994     return c;
995     }
996    
997    
998    
999     #ifdef SUPPORT_UCP
1000     /*************************************************
1001     * Handle \P and \p *
1002     *************************************************/
1003    
1004     /* This function is called after \P or \p has been encountered, provided that
1005     PCRE is compiled with support for Unicode properties. On entry, ptrptr is
1006     pointing at the P or p. On exit, it is pointing at the final character of the
1007     escape sequence.
1008    
1009     Argument:
1010     ptrptr points to the pattern position pointer
1011     negptr points to a boolean that is set TRUE for negation else FALSE
1012 nigel 87 dptr points to an int that is set to the detailed property value
1013 nigel 77 errorcodeptr points to the error code variable
1014    
1015 nigel 87 Returns: type value from ucp_type_table, or -1 for an invalid type
1016 nigel 77 */
1017    
1018     static int
1019 nigel 87 get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
1020 nigel 77 {
1021     int c, i, bot, top;
1022     const uschar *ptr = *ptrptr;
1023 nigel 87 char name[32];
1024 nigel 77
1025     c = *(++ptr);
1026     if (c == 0) goto ERROR_RETURN;
1027    
1028     *negptr = FALSE;
1029    
1030 nigel 87 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
1031     negation. */
1032 nigel 77
1033 ph10 391 if (c == CHAR_LEFT_CURLY_BRACKET)
1034 nigel 77 {
1035 ph10 391 if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1036 nigel 77 {
1037     *negptr = TRUE;
1038     ptr++;
1039     }
1040 ph10 199 for (i = 0; i < (int)sizeof(name) - 1; i++)
1041 nigel 77 {
1042     c = *(++ptr);
1043     if (c == 0) goto ERROR_RETURN;
1044 ph10 391 if (c == CHAR_RIGHT_CURLY_BRACKET) break;
1045 nigel 77 name[i] = c;
1046     }
1047 ph10 391 if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
1048 nigel 77 name[i] = 0;
1049     }
1050    
1051     /* Otherwise there is just one following character */
1052    
1053     else
1054     {
1055     name[0] = c;
1056     name[1] = 0;
1057     }
1058    
1059     *ptrptr = ptr;
1060    
1061     /* Search for a recognized property name using binary chop */
1062    
1063     bot = 0;
1064     top = _pcre_utt_size;
1065    
1066     while (bot < top)
1067     {
1068 nigel 87 i = (bot + top) >> 1;
1069 ph10 240 c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);
1070 nigel 87 if (c == 0)
1071     {
1072     *dptr = _pcre_utt[i].value;
1073     return _pcre_utt[i].type;
1074     }
1075 nigel 77 if (c > 0) bot = i + 1; else top = i;
1076     }
1077    
1078     *errorcodeptr = ERR47;
1079     *ptrptr = ptr;
1080     return -1;
1081    
1082     ERROR_RETURN:
1083     *errorcodeptr = ERR46;
1084     *ptrptr = ptr;
1085     return -1;
1086     }
1087     #endif
1088    
1089    
1090    
1091    
1092     /*************************************************
1093     * Read repeat counts *
1094     *************************************************/
1095    
1096     /* Read an item of the form {n,m} and return the values. This is called only
1097     after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1098     so the syntax is guaranteed to be correct, but we need to check the values.
1099    
1100     Arguments:
1101     p pointer to first char after '{'
1102     minp pointer to int for min
1103     maxp pointer to int for max
1104     returned as -1 if no max
1105     errorcodeptr points to error code variable
1106    
1107     Returns: pointer to '}' on success;
1108     current ptr on error, with errorcodeptr set non-zero
1109     */
1110    
1111     static const uschar *
1112     read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
1113     {
1114     int min = 0;
1115     int max = -1;
1116    
1117 nigel 81 /* Read the minimum value and do a paranoid check: a negative value indicates
1118     an integer overflow. */
1119    
1120 ph10 391 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - CHAR_0;
1121 nigel 81 if (min < 0 || min > 65535)
1122     {
1123     *errorcodeptr = ERR5;
1124     return p;
1125     }
1126 nigel 77
1127 nigel 81 /* Read the maximum value if there is one, and again do a paranoid on its size.
1128     Also, max must not be less than min. */
1129    
1130 ph10 391 if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
1131 nigel 77 {
1132 ph10 391 if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
1133 nigel 77 {
1134     max = 0;
1135 ph10 391 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - CHAR_0;
1136 nigel 81 if (max < 0 || max > 65535)
1137     {
1138     *errorcodeptr = ERR5;
1139     return p;
1140     }
1141 nigel 77 if (max < min)
1142     {
1143     *errorcodeptr = ERR4;
1144     return p;
1145     }
1146     }
1147     }
1148    
1149 nigel 81 /* Fill in the required variables, and pass back the pointer to the terminating
1150     '}'. */
1151 nigel 77
1152 nigel 81 *minp = min;
1153     *maxp = max;
1154 nigel 77 return p;
1155     }
1156    
1157    
1158    
1159     /*************************************************
1160 ph10 408 * Subroutine for finding forward reference *
1161 nigel 91 *************************************************/
1162    
1163 ph10 408 /* This recursive function is called only from find_parens() below. The
1164     top-level call starts at the beginning of the pattern. All other calls must
1165     start at a parenthesis. It scans along a pattern's text looking for capturing
1166 nigel 93 subpatterns, and counting them. If it finds a named pattern that matches the
1167     name it is given, it returns its number. Alternatively, if the name is NULL, it
1168 ph10 578 returns when it reaches a given numbered subpattern. Recursion is used to keep
1169     track of subpatterns that reset the capturing group numbers - the (?| feature.
1170 nigel 91
1171 ph10 578 This function was originally called only from the second pass, in which we know
1172     that if (?< or (?' or (?P< is encountered, the name will be correctly
1173     terminated because that is checked in the first pass. There is now one call to
1174     this function in the first pass, to check for a recursive back reference by
1175     name (so that we can make the whole group atomic). In this case, we need check
1176 ph10 579 only up to the current position in the pattern, and that is still OK because
1177     any previous occurrences will have been checked. To make this work, the test
1178     for "end of pattern" is a check against cd->end_pattern in the main loop,
1179 ph10 578 instead of looking for a binary zero. This means that the special first-pass
1180 ph10 579 call can adjust cd->end_pattern temporarily. (Checks for binary zero while
1181     processing items within the loop are OK, because afterwards the main loop will
1182 ph10 578 terminate.)
1183    
1184 nigel 91 Arguments:
1185 ph10 408 ptrptr address of the current character pointer (updated)
1186 ph10 345 cd compile background data
1187 nigel 93 name name to seek, or NULL if seeking a numbered subpattern
1188     lorn name length, or subpattern number if name is NULL
1189     xmode TRUE if we are in /x mode
1190 ph10 579 utf8 TRUE if we are in UTF-8 mode
1191 ph10 411 count pointer to the current capturing subpattern number (updated)
1192 nigel 91
1193     Returns: the number of the named subpattern, or -1 if not found
1194     */
1195    
1196     static int
1197 ph10 408 find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn,
1198 ph10 556 BOOL xmode, BOOL utf8, int *count)
1199 nigel 91 {
1200 ph10 408 uschar *ptr = *ptrptr;
1201     int start_count = *count;
1202     int hwm_count = start_count;
1203     BOOL dup_parens = FALSE;
1204 nigel 93
1205 ph10 411 /* If the first character is a parenthesis, check on the type of group we are
1206 ph10 408 dealing with. The very first call may not start with a parenthesis. */
1207    
1208     if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1209     {
1210 ph10 544 /* Handle specials such as (*SKIP) or (*UTF8) etc. */
1211 ph10 545
1212 ph10 544 if (ptr[1] == CHAR_ASTERISK) ptr += 2;
1213 ph10 545
1214 ph10 544 /* Handle a normal, unnamed capturing parenthesis. */
1215 ph10 408
1216 ph10 544 else if (ptr[1] != CHAR_QUESTION_MARK)
1217 ph10 408 {
1218     *count += 1;
1219     if (name == NULL && *count == lorn) return *count;
1220 ph10 411 ptr++;
1221 ph10 408 }
1222    
1223 ph10 544 /* All cases now have (? at the start. Remember when we are in a group
1224     where the parenthesis numbers are duplicated. */
1225    
1226     else if (ptr[2] == CHAR_VERTICAL_LINE)
1227     {
1228     ptr += 3;
1229     dup_parens = TRUE;
1230     }
1231 ph10 545
1232 ph10 544 /* Handle comments; all characters are allowed until a ket is reached. */
1233    
1234     else if (ptr[2] == CHAR_NUMBER_SIGN)
1235     {
1236     for (ptr += 3; *ptr != 0; ptr++) if (*ptr == CHAR_RIGHT_PARENTHESIS) break;
1237     goto FAIL_EXIT;
1238 ph10 545 }
1239 ph10 544
1240 ph10 408 /* Handle a condition. If it is an assertion, just carry on so that it
1241     is processed as normal. If not, skip to the closing parenthesis of the
1242 ph10 544 condition (there can't be any nested parens). */
1243 ph10 411
1244 ph10 408 else if (ptr[2] == CHAR_LEFT_PARENTHESIS)
1245     {
1246 ph10 411 ptr += 2;
1247 ph10 408 if (ptr[1] != CHAR_QUESTION_MARK)
1248     {
1249     while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
1250 ph10 411 if (*ptr != 0) ptr++;
1251 ph10 408 }
1252 ph10 411 }
1253    
1254 ph10 544 /* Start with (? but not a condition. */
1255 ph10 408
1256     else
1257 ph10 411 {
1258 ph10 408 ptr += 2;
1259     if (*ptr == CHAR_P) ptr++; /* Allow optional P */
1260    
1261     /* We have to disambiguate (?<! and (?<= from (?<name> for named groups */
1262 ph10 411
1263 ph10 408 if ((*ptr == CHAR_LESS_THAN_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK &&
1264     ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)
1265     {
1266     int term;
1267     const uschar *thisname;
1268     *count += 1;
1269     if (name == NULL && *count == lorn) return *count;
1270     term = *ptr++;
1271     if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;
1272     thisname = ptr;
1273     while (*ptr != term) ptr++;
1274     if (name != NULL && lorn == ptr - thisname &&
1275     strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1276     return *count;
1277 ph10 461 term++;
1278 ph10 411 }
1279 ph10 408 }
1280 ph10 411 }
1281 ph10 408
1282 ph10 411 /* Past any initial parenthesis handling, scan for parentheses or vertical
1283 ph10 579 bars. Stop if we get to cd->end_pattern. Note that this is important for the
1284     first-pass call when this value is temporarily adjusted to stop at the current
1285 ph10 578 position. So DO NOT change this to a test for binary zero. */
1286 ph10 408
1287 ph10 578 for (; ptr < cd->end_pattern; ptr++)
1288 nigel 91 {
1289 nigel 93 /* Skip over backslashed characters and also entire \Q...\E */
1290    
1291 ph10 391 if (*ptr == CHAR_BACKSLASH)
1292 nigel 93 {
1293 ph10 408 if (*(++ptr) == 0) goto FAIL_EXIT;
1294 ph10 391 if (*ptr == CHAR_Q) for (;;)
1295 nigel 93 {
1296 ph10 391 while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1297 ph10 408 if (*ptr == 0) goto FAIL_EXIT;
1298 ph10 391 if (*(++ptr) == CHAR_E) break;
1299 nigel 93 }
1300     continue;
1301     }
1302    
1303 ph10 340 /* Skip over character classes; this logic must be similar to the way they
1304     are handled for real. If the first character is '^', skip it. Also, if the
1305     first few characters (either before or after ^) are \Q\E or \E we skip them
1306 ph10 392 too. This makes for compatibility with Perl. Note the use of STR macros to
1307 ph10 391 encode "Q\\E" so that it works in UTF-8 on EBCDIC platforms. */
1308 nigel 93
1309 ph10 391 if (*ptr == CHAR_LEFT_SQUARE_BRACKET)
1310 nigel 93 {
1311 ph10 340 BOOL negate_class = FALSE;
1312     for (;;)
1313     {
1314 ph10 438 if (ptr[1] == CHAR_BACKSLASH)
1315 ph10 340 {
1316 ph10 438 if (ptr[2] == CHAR_E)
1317     ptr+= 2;
1318     else if (strncmp((const char *)ptr+2,
1319 ph10 392 STR_Q STR_BACKSLASH STR_E, 3) == 0)
1320 ph10 438 ptr += 4;
1321 ph10 392 else
1322 ph10 391 break;
1323 ph10 340 }
1324 ph10 438 else if (!negate_class && ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1325 ph10 461 {
1326 ph10 340 negate_class = TRUE;
1327 ph10 438 ptr++;
1328 ph10 461 }
1329 ph10 340 else break;
1330     }
1331    
1332     /* If the next character is ']', it is a data character that must be
1333 ph10 341 skipped, except in JavaScript compatibility mode. */
1334 ph10 345
1335 ph10 392 if (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET &&
1336 ph10 391 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
1337 ph10 345 ptr++;
1338    
1339 ph10 391 while (*(++ptr) != CHAR_RIGHT_SQUARE_BRACKET)
1340 nigel 93 {
1341 ph10 220 if (*ptr == 0) return -1;
1342 ph10 391 if (*ptr == CHAR_BACKSLASH)
1343 nigel 93 {
1344 ph10 408 if (*(++ptr) == 0) goto FAIL_EXIT;
1345 ph10 391 if (*ptr == CHAR_Q) for (;;)
1346 nigel 93 {
1347 ph10 391 while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1348 ph10 408 if (*ptr == 0) goto FAIL_EXIT;
1349 ph10 391 if (*(++ptr) == CHAR_E) break;
1350 nigel 93 }
1351     continue;
1352     }
1353     }
1354     continue;
1355     }
1356    
1357     /* Skip comments in /x mode */
1358    
1359 ph10 391 if (xmode && *ptr == CHAR_NUMBER_SIGN)
1360 nigel 93 {
1361 ph10 579 ptr++;
1362 ph10 556 while (*ptr != 0)
1363     {
1364     if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
1365     ptr++;
1366 ph10 579 #ifdef SUPPORT_UTF8
1367 ph10 556 if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
1368     #endif
1369     }
1370 ph10 408 if (*ptr == 0) goto FAIL_EXIT;
1371 nigel 93 continue;
1372     }
1373    
1374 ph10 408 /* Check for the special metacharacters */
1375 ph10 411
1376 ph10 408 if (*ptr == CHAR_LEFT_PARENTHESIS)
1377 nigel 93 {
1378 ph10 556 int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, count);
1379 ph10 408 if (rc > 0) return rc;
1380     if (*ptr == 0) goto FAIL_EXIT;
1381 nigel 93 }
1382 ph10 411
1383 ph10 408 else if (*ptr == CHAR_RIGHT_PARENTHESIS)
1384     {
1385     if (dup_parens && *count < hwm_count) *count = hwm_count;
1386 ph10 545 goto FAIL_EXIT;
1387 ph10 408 }
1388 ph10 411
1389     else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)
1390 ph10 408 {
1391     if (*count > hwm_count) hwm_count = *count;
1392     *count = start_count;
1393 ph10 411 }
1394 ph10 408 }
1395 nigel 93
1396 ph10 408 FAIL_EXIT:
1397     *ptrptr = ptr;
1398     return -1;
1399     }
1400 nigel 93
1401    
1402    
1403    
1404 ph10 408 /*************************************************
1405     * Find forward referenced subpattern *
1406     *************************************************/
1407 nigel 93
1408 ph10 408 /* This function scans along a pattern's text looking for capturing
1409     subpatterns, and counting them. If it finds a named pattern that matches the
1410     name it is given, it returns its number. Alternatively, if the name is NULL, it
1411     returns when it reaches a given numbered subpattern. This is used for forward
1412     references to subpatterns. We used to be able to start this scan from the
1413     current compiling point, using the current count value from cd->bracount, and
1414     do it all in a single loop, but the addition of the possibility of duplicate
1415     subpattern numbers means that we have to scan from the very start, in order to
1416     take account of such duplicates, and to use a recursive function to keep track
1417     of the different types of group.
1418    
1419     Arguments:
1420     cd compile background data
1421     name name to seek, or NULL if seeking a numbered subpattern
1422     lorn name length, or subpattern number if name is NULL
1423     xmode TRUE if we are in /x mode
1424 ph10 579 utf8 TRUE if we are in UTF-8 mode
1425 ph10 408
1426     Returns: the number of the found subpattern, or -1 if not found
1427     */
1428    
1429     static int
1430 ph10 556 find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode,
1431     BOOL utf8)
1432 ph10 408 {
1433     uschar *ptr = (uschar *)cd->start_pattern;
1434     int count = 0;
1435     int rc;
1436    
1437     /* If the pattern does not start with an opening parenthesis, the first call
1438     to find_parens_sub() will scan right to the end (if necessary). However, if it
1439     does start with a parenthesis, find_parens_sub() will return when it hits the
1440     matching closing parens. That is why we have to have a loop. */
1441    
1442 ph10 411 for (;;)
1443     {
1444 ph10 556 rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, &count);
1445 ph10 411 if (rc > 0 || *ptr++ == 0) break;
1446     }
1447    
1448 ph10 408 return rc;
1449 nigel 91 }
1450    
1451    
1452    
1453 ph10 408
1454 nigel 91 /*************************************************
1455 nigel 77 * Find first significant op code *
1456     *************************************************/
1457    
1458     /* This is called by several functions that scan a compiled expression looking
1459     for a fixed first character, or an anchoring op code etc. It skips over things
1460 ph10 602 that do not influence this. For some calls, it makes sense to skip negative
1461     forward and all backward assertions, and also the \b assertion; for others it
1462     does not.
1463 nigel 77
1464     Arguments:
1465     code pointer to the start of the group
1466     skipassert TRUE if certain assertions are to be skipped
1467    
1468     Returns: pointer to the first significant opcode
1469     */
1470    
1471     static const uschar*
1472 ph10 604 first_significant_code(const uschar *code, BOOL skipassert)
1473 nigel 77 {
1474     for (;;)
1475     {
1476     switch ((int)*code)
1477     {
1478     case OP_ASSERT_NOT:
1479     case OP_ASSERTBACK:
1480     case OP_ASSERTBACK_NOT:
1481     if (!skipassert) return code;
1482     do code += GET(code, 1); while (*code == OP_ALT);
1483     code += _pcre_OP_lengths[*code];
1484     break;
1485    
1486     case OP_WORD_BOUNDARY:
1487     case OP_NOT_WORD_BOUNDARY:
1488     if (!skipassert) return code;
1489     /* Fall through */
1490    
1491     case OP_CALLOUT:
1492     case OP_CREF:
1493 ph10 459 case OP_NCREF:
1494 nigel 93 case OP_RREF:
1495 ph10 459 case OP_NRREF:
1496 nigel 93 case OP_DEF:
1497 nigel 77 code += _pcre_OP_lengths[*code];
1498     break;
1499    
1500     default:
1501     return code;
1502     }
1503     }
1504     /* Control never reaches here */
1505     }
1506    
1507    
1508    
1509    
1510     /*************************************************
1511 ph10 454 * Find the fixed length of a branch *
1512 nigel 77 *************************************************/
1513    
1514 ph10 454 /* Scan a branch and compute the fixed length of subject that will match it,
1515 nigel 77 if the length is fixed. This is needed for dealing with backward assertions.
1516 ph10 461 In UTF8 mode, the result is in characters rather than bytes. The branch is
1517 ph10 454 temporarily terminated with OP_END when this function is called.
1518 nigel 77
1519 ph10 461 This function is called when a backward assertion is encountered, so that if it
1520     fails, the error message can point to the correct place in the pattern.
1521 ph10 454 However, we cannot do this when the assertion contains subroutine calls,
1522 ph10 461 because they can be forward references. We solve this by remembering this case
1523 ph10 454 and doing the check at the end; a flag specifies which mode we are running in.
1524    
1525 nigel 77 Arguments:
1526     code points to the start of the pattern (the bracket)
1527 ph10 604 utf8 TRUE in UTF-8 mode
1528 ph10 461 atend TRUE if called when the pattern is complete
1529     cd the "compile data" structure
1530 nigel 77
1531 ph10 461 Returns: the fixed length,
1532 ph10 454 or -1 if there is no fixed length,
1533 ph10 754 or -2 if \C was encountered (in UTF-8 mode only)
1534 ph10 454 or -3 if an OP_RECURSE item was encountered and atend is FALSE
1535 ph10 747 or -4 if an unknown opcode was encountered (internal error)
1536 nigel 77 */
1537    
1538     static int
1539 ph10 604 find_fixedlength(uschar *code, BOOL utf8, BOOL atend, compile_data *cd)
1540 nigel 77 {
1541     int length = -1;
1542    
1543     register int branchlength = 0;
1544     register uschar *cc = code + 1 + LINK_SIZE;
1545    
1546     /* Scan along the opcodes for this branch. If we get to the end of the
1547     branch, check the length against that of the other branches. */
1548    
1549     for (;;)
1550     {
1551     int d;
1552 ph10 454 uschar *ce, *cs;
1553 nigel 77 register int op = *cc;
1554     switch (op)
1555     {
1556 ph10 604 /* We only need to continue for OP_CBRA (normal capturing bracket) and
1557     OP_BRA (normal non-capturing bracket) because the other variants of these
1558     opcodes are all concerned with unlimited repeated groups, which of course
1559 ph10 747 are not of fixed length. */
1560 ph10 604
1561 nigel 93 case OP_CBRA:
1562 nigel 77 case OP_BRA:
1563     case OP_ONCE:
1564 ph10 733 case OP_ONCE_NC:
1565 nigel 77 case OP_COND:
1566 ph10 604 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), utf8, atend, cd);
1567 nigel 77 if (d < 0) return d;
1568     branchlength += d;
1569     do cc += GET(cc, 1); while (*cc == OP_ALT);
1570     cc += 1 + LINK_SIZE;
1571     break;
1572    
1573 ph10 747 /* Reached end of a branch; if it's a ket it is the end of a nested call.
1574     If it's ALT it is an alternation in a nested call. An ACCEPT is effectively
1575     an ALT. If it is END it's the end of the outer call. All can be handled by
1576     the same code. Note that we must not include the OP_KETRxxx opcodes here,
1577     because they all imply an unlimited repeat. */
1578 nigel 77
1579     case OP_ALT:
1580     case OP_KET:
1581     case OP_END:
1582 ph10 747 case OP_ACCEPT:
1583     case OP_ASSERT_ACCEPT:
1584 nigel 77 if (length < 0) length = branchlength;
1585     else if (length != branchlength) return -1;
1586     if (*cc != OP_ALT) return length;
1587     cc += 1 + LINK_SIZE;
1588     branchlength = 0;
1589     break;
1590 ph10 461
1591 ph10 454 /* A true recursion implies not fixed length, but a subroutine call may
1592     be OK. If the subroutine is a forward reference, we can't deal with
1593     it until the end of the pattern, so return -3. */
1594 ph10 461
1595 ph10 454 case OP_RECURSE:
1596     if (!atend) return -3;
1597     cs = ce = (uschar *)cd->start_code + GET(cc, 1); /* Start subpattern */
1598     do ce += GET(ce, 1); while (*ce == OP_ALT); /* End subpattern */
1599     if (cc > cs && cc < ce) return -1; /* Recursion */
1600 ph10 604 d = find_fixedlength(cs + 2, utf8, atend, cd);
1601 ph10 461 if (d < 0) return d;
1602 ph10 454 branchlength += d;
1603     cc += 1 + LINK_SIZE;
1604 ph10 461 break;
1605 nigel 77
1606     /* Skip over assertive subpatterns */
1607    
1608     case OP_ASSERT:
1609     case OP_ASSERT_NOT:
1610     case OP_ASSERTBACK:
1611     case OP_ASSERTBACK_NOT:
1612     do cc += GET(cc, 1); while (*cc == OP_ALT);
1613     /* Fall through */
1614    
1615     /* Skip over things that don't match chars */
1616    
1617 ph10 747 case OP_MARK:
1618     case OP_PRUNE_ARG:
1619     case OP_SKIP_ARG:
1620     case OP_THEN_ARG:
1621     cc += cc[1] + _pcre_OP_lengths[*cc];
1622     break;
1623    
1624 nigel 77 case OP_CALLOUT:
1625     case OP_CIRC:
1626 ph10 602 case OP_CIRCM:
1627 ph10 747 case OP_CLOSE:
1628     case OP_COMMIT:
1629     case OP_CREF:
1630     case OP_DEF:
1631 nigel 77 case OP_DOLL:
1632 ph10 602 case OP_DOLLM:
1633 ph10 747 case OP_EOD:
1634     case OP_EODN:
1635     case OP_FAIL:
1636     case OP_NCREF:
1637     case OP_NRREF:
1638 nigel 77 case OP_NOT_WORD_BOUNDARY:
1639 ph10 747 case OP_PRUNE:
1640     case OP_REVERSE:
1641     case OP_RREF:
1642     case OP_SET_SOM:
1643     case OP_SKIP:
1644     case OP_SOD:
1645     case OP_SOM:
1646     case OP_THEN:
1647 nigel 77 case OP_WORD_BOUNDARY:
1648     cc += _pcre_OP_lengths[*cc];
1649     break;
1650    
1651     /* Handle literal characters */
1652    
1653     case OP_CHAR:
1654 ph10 602 case OP_CHARI:
1655 nigel 91 case OP_NOT:
1656 ph10 604 case OP_NOTI:
1657 nigel 77 branchlength++;
1658     cc += 2;
1659     #ifdef SUPPORT_UTF8
1660 ph10 604 if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
1661 nigel 77 #endif
1662     break;
1663    
1664     /* Handle exact repetitions. The count is already in characters, but we
1665     need to skip over a multibyte character in UTF8 mode. */
1666    
1667     case OP_EXACT:
1668 ph10 747 case OP_EXACTI:
1669     case OP_NOTEXACT:
1670     case OP_NOTEXACTI:
1671 nigel 77 branchlength += GET2(cc,1);
1672     cc += 4;
1673     #ifdef SUPPORT_UTF8
1674 ph10 604 if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
1675 nigel 77 #endif
1676     break;
1677    
1678     case OP_TYPEEXACT:
1679     branchlength += GET2(cc,1);
1680 ph10 220 if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1681 nigel 77 cc += 4;
1682     break;
1683    
1684     /* Handle single-char matchers */
1685    
1686     case OP_PROP:
1687     case OP_NOTPROP:
1688 nigel 87 cc += 2;
1689 nigel 77 /* Fall through */
1690    
1691 ph10 747 case OP_HSPACE:
1692     case OP_VSPACE:
1693     case OP_NOT_HSPACE:
1694     case OP_NOT_VSPACE:
1695 nigel 77 case OP_NOT_DIGIT:
1696     case OP_DIGIT:
1697     case OP_NOT_WHITESPACE:
1698     case OP_WHITESPACE:
1699     case OP_NOT_WORDCHAR:
1700     case OP_WORDCHAR:
1701     case OP_ANY:
1702 ph10 342 case OP_ALLANY:
1703 nigel 77 branchlength++;
1704     cc++;
1705     break;
1706    
1707 ph10 754 /* The single-byte matcher isn't allowed. This only happens in UTF-8 mode;
1708     otherwise \C is coded as OP_ALLANY. */
1709 nigel 77
1710     case OP_ANYBYTE:
1711     return -2;
1712    
1713     /* Check a class for variable quantification */
1714    
1715     #ifdef SUPPORT_UTF8
1716     case OP_XCLASS:
1717     cc += GET(cc, 1) - 33;
1718     /* Fall through */
1719     #endif
1720    
1721     case OP_CLASS:
1722     case OP_NCLASS:
1723     cc += 33;
1724    
1725     switch (*cc)
1726     {
1727 ph10 747 case OP_CRPLUS:
1728     case OP_CRMINPLUS:
1729 nigel 77 case OP_CRSTAR:
1730     case OP_CRMINSTAR:
1731     case OP_CRQUERY:
1732     case OP_CRMINQUERY:
1733     return -1;
1734    
1735     case OP_CRRANGE:
1736     case OP_CRMINRANGE:
1737     if (GET2(cc,1) != GET2(cc,3)) return -1;
1738     branchlength += GET2(cc,1);
1739     cc += 5;
1740     break;
1741    
1742     default:
1743     branchlength++;
1744     }
1745     break;
1746    
1747     /* Anything else is variable length */
1748    
1749 ph10 747 case OP_ANYNL:
1750     case OP_BRAMINZERO:
1751     case OP_BRAPOS:
1752     case OP_BRAPOSZERO:
1753     case OP_BRAZERO:
1754     case OP_CBRAPOS:
1755     case OP_EXTUNI:
1756     case OP_KETRMAX:
1757     case OP_KETRMIN:
1758     case OP_KETRPOS:
1759     case OP_MINPLUS:
1760     case OP_MINPLUSI:
1761     case OP_MINQUERY:
1762     case OP_MINQUERYI:
1763     case OP_MINSTAR:
1764     case OP_MINSTARI:
1765     case OP_MINUPTO:
1766     case OP_MINUPTOI:
1767     case OP_NOTMINPLUS:
1768     case OP_NOTMINPLUSI:
1769     case OP_NOTMINQUERY:
1770     case OP_NOTMINQUERYI:
1771     case OP_NOTMINSTAR:
1772     case OP_NOTMINSTARI:
1773     case OP_NOTMINUPTO:
1774     case OP_NOTMINUPTOI:
1775     case OP_NOTPLUS:
1776     case OP_NOTPLUSI:
1777     case OP_NOTPOSPLUS:
1778     case OP_NOTPOSPLUSI:
1779     case OP_NOTPOSQUERY:
1780     case OP_NOTPOSQUERYI:
1781     case OP_NOTPOSSTAR:
1782     case OP_NOTPOSSTARI:
1783     case OP_NOTPOSUPTO:
1784     case OP_NOTPOSUPTOI:
1785     case OP_NOTQUERY:
1786     case OP_NOTQUERYI:
1787     case OP_NOTSTAR:
1788     case OP_NOTSTARI:
1789     case OP_NOTUPTO:
1790     case OP_NOTUPTOI:
1791     case OP_PLUS:
1792     case OP_PLUSI:
1793     case OP_POSPLUS:
1794     case OP_POSPLUSI:
1795     case OP_POSQUERY:
1796     case OP_POSQUERYI:
1797     case OP_POSSTAR:
1798     case OP_POSSTARI:
1799     case OP_POSUPTO:
1800     case OP_POSUPTOI:
1801     case OP_QUERY:
1802     case OP_QUERYI:
1803     case OP_REF:
1804     case OP_REFI:
1805     case OP_SBRA:
1806     case OP_SBRAPOS:
1807     case OP_SCBRA:
1808     case OP_SCBRAPOS:
1809     case OP_SCOND:
1810     case OP_SKIPZERO:
1811     case OP_STAR:
1812     case OP_STARI:
1813     case OP_TYPEMINPLUS:
1814     case OP_TYPEMINQUERY:
1815     case OP_TYPEMINSTAR:
1816     case OP_TYPEMINUPTO:
1817     case OP_TYPEPLUS:
1818     case OP_TYPEPOSPLUS:
1819     case OP_TYPEPOSQUERY:
1820     case OP_TYPEPOSSTAR:
1821     case OP_TYPEPOSUPTO:
1822     case OP_TYPEQUERY:
1823     case OP_TYPESTAR:
1824     case OP_TYPEUPTO:
1825     case OP_UPTO:
1826     case OP_UPTOI:
1827     return -1;
1828    
1829     /* Catch unrecognized opcodes so that when new ones are added they
1830     are not forgotten, as has happened in the past. */
1831    
1832 nigel 77 default:
1833 ph10 747 return -4;
1834 nigel 77 }
1835     }
1836     /* Control never gets here */
1837     }
1838    
1839    
1840    
1841    
1842     /*************************************************
1843 ph10 454 * Scan compiled regex for specific bracket *
1844 nigel 77 *************************************************/
1845    
1846     /* This little function scans through a compiled pattern until it finds a
1847 ph10 454 capturing bracket with the given number, or, if the number is negative, an
1848 ph10 461 instance of OP_REVERSE for a lookbehind. The function is global in the C sense
1849     so that it can be called from pcre_study() when finding the minimum matching
1850 ph10 455 length.
1851 nigel 77
1852     Arguments:
1853     code points to start of expression
1854     utf8 TRUE in UTF-8 mode
1855 ph10 454 number the required bracket number or negative to find a lookbehind
1856 nigel 77
1857     Returns: pointer to the opcode for the bracket, or NULL if not found
1858     */
1859    
1860 ph10 455 const uschar *
1861     _pcre_find_bracket(const uschar *code, BOOL utf8, int number)
1862 nigel 77 {
1863     for (;;)
1864     {
1865     register int c = *code;
1866 ph10 618
1867 nigel 77 if (c == OP_END) return NULL;
1868 nigel 91
1869     /* XCLASS is used for classes that cannot be represented just by a bit
1870     map. This includes negated single high-valued characters. The length in
1871     the table is zero; the actual length is stored in the compiled code. */
1872    
1873     if (c == OP_XCLASS) code += GET(code, 1);
1874 ph10 461
1875 ph10 454 /* Handle recursion */
1876 ph10 461
1877 ph10 454 else if (c == OP_REVERSE)
1878     {
1879 ph10 461 if (number < 0) return (uschar *)code;
1880 ph10 454 code += _pcre_OP_lengths[c];
1881     }
1882 nigel 91
1883 nigel 93 /* Handle capturing bracket */
1884 nigel 91
1885 ph10 604 else if (c == OP_CBRA || c == OP_SCBRA ||
1886     c == OP_CBRAPOS || c == OP_SCBRAPOS)
1887 nigel 77 {
1888 nigel 93 int n = GET2(code, 1+LINK_SIZE);
1889 nigel 77 if (n == number) return (uschar *)code;
1890 nigel 93 code += _pcre_OP_lengths[c];
1891 nigel 77 }
1892 nigel 91
1893 ph10 220 /* Otherwise, we can get the item's length from the table, except that for
1894     repeated character types, we have to test for \p and \P, which have an extra
1895 ph10 512 two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
1896 ph10 510 must add in its length. */
1897 nigel 91
1898 nigel 77 else
1899     {
1900 ph10 218 switch(c)
1901     {
1902     case OP_TYPESTAR:
1903     case OP_TYPEMINSTAR:
1904     case OP_TYPEPLUS:
1905     case OP_TYPEMINPLUS:
1906     case OP_TYPEQUERY:
1907     case OP_TYPEMINQUERY:
1908     case OP_TYPEPOSSTAR:
1909     case OP_TYPEPOSPLUS:
1910     case OP_TYPEPOSQUERY:
1911     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1912 ph10 220 break;
1913 ph10 221
1914     case OP_TYPEUPTO:
1915     case OP_TYPEMINUPTO:
1916     case OP_TYPEEXACT:
1917     case OP_TYPEPOSUPTO:
1918     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1919     break;
1920 ph10 512
1921 ph10 510 case OP_MARK:
1922     case OP_PRUNE_ARG:
1923     case OP_SKIP_ARG:
1924     code += code[1];
1925 ph10 512 break;
1926 ph10 550
1927     case OP_THEN_ARG:
1928 ph10 716 code += code[1];
1929 ph10 550 break;
1930 ph10 220 }
1931    
1932 ph10 218 /* Add in the fixed length from the table */
1933 ph10 220
1934 nigel 77 code += _pcre_OP_lengths[c];
1935 ph10 220
1936 ph10 218 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1937     a multi-byte character. The length in the table is a minimum, so we have to
1938     arrange to skip the extra bytes. */
1939 ph10 220
1940 ph10 107 #ifdef SUPPORT_UTF8
1941 nigel 77 if (utf8) switch(c)
1942     {
1943     case OP_CHAR:
1944 ph10 602 case OP_CHARI:
1945 nigel 77 case OP_EXACT:
1946 ph10 602 case OP_EXACTI:
1947 nigel 77 case OP_UPTO:
1948 ph10 602 case OP_UPTOI:
1949 nigel 77 case OP_MINUPTO:
1950 ph10 602 case OP_MINUPTOI:
1951 nigel 93 case OP_POSUPTO:
1952 ph10 602 case OP_POSUPTOI:
1953 nigel 77 case OP_STAR:
1954 ph10 602 case OP_STARI:
1955 nigel 77 case OP_MINSTAR:
1956 ph10 602 case OP_MINSTARI:
1957 nigel 93 case OP_POSSTAR:
1958 ph10 602 case OP_POSSTARI:
1959 nigel 77 case OP_PLUS:
1960 ph10 602 case OP_PLUSI:
1961 nigel 77 case OP_MINPLUS:
1962 ph10 602 case OP_MINPLUSI:
1963 nigel 93 case OP_POSPLUS:
1964 ph10 602 case OP_POSPLUSI:
1965 nigel 77 case OP_QUERY:
1966 ph10 602 case OP_QUERYI:
1967 nigel 77 case OP_MINQUERY:
1968 ph10 602 case OP_MINQUERYI:
1969 nigel 93 case OP_POSQUERY:
1970 ph10 602 case OP_POSQUERYI:
1971 nigel 93 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1972 nigel 77 break;
1973     }
1974 ph10 369 #else
1975     (void)(utf8); /* Keep compiler happy by referencing function argument */
1976 ph10 111 #endif
1977 nigel 77 }
1978     }
1979     }
1980    
1981    
1982    
1983     /*************************************************
1984     * Scan compiled regex for recursion reference *
1985     *************************************************/
1986    
1987     /* This little function scans through a compiled pattern until it finds an
1988     instance of OP_RECURSE.
1989    
1990     Arguments:
1991     code points to start of expression
1992     utf8 TRUE in UTF-8 mode
1993    
1994     Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1995     */
1996    
1997     static const uschar *
1998     find_recurse(const uschar *code, BOOL utf8)
1999     {
2000     for (;;)
2001     {
2002     register int c = *code;
2003     if (c == OP_END) return NULL;
2004 nigel 91 if (c == OP_RECURSE) return code;
2005 ph10 220
2006 nigel 91 /* XCLASS is used for classes that cannot be represented just by a bit
2007     map. This includes negated single high-valued characters. The length in
2008     the table is zero; the actual length is stored in the compiled code. */
2009    
2010     if (c == OP_XCLASS) code += GET(code, 1);
2011    
2012 ph10 220 /* Otherwise, we can get the item's length from the table, except that for
2013     repeated character types, we have to test for \p and \P, which have an extra
2014 ph10 512 two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
2015 ph10 510 must add in its length. */
2016 nigel 91
2017 nigel 77 else
2018     {
2019 ph10 218 switch(c)
2020     {
2021     case OP_TYPESTAR:
2022     case OP_TYPEMINSTAR:
2023     case OP_TYPEPLUS:
2024     case OP_TYPEMINPLUS:
2025     case OP_TYPEQUERY:
2026     case OP_TYPEMINQUERY:
2027     case OP_TYPEPOSSTAR:
2028     case OP_TYPEPOSPLUS:
2029     case OP_TYPEPOSQUERY:
2030     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2031 ph10 220 break;
2032 ph10 221
2033     case OP_TYPEPOSUPTO:
2034     case OP_TYPEUPTO:
2035     case OP_TYPEMINUPTO:
2036     case OP_TYPEEXACT:
2037     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
2038     break;
2039 ph10 512
2040 ph10 510 case OP_MARK:
2041     case OP_PRUNE_ARG:
2042     case OP_SKIP_ARG:
2043     code += code[1];
2044 ph10 512 break;
2045 ph10 550
2046     case OP_THEN_ARG:
2047 ph10 716 code += code[1];
2048 ph10 550 break;
2049 ph10 220 }
2050    
2051 ph10 218 /* Add in the fixed length from the table */
2052    
2053 nigel 77 code += _pcre_OP_lengths[c];
2054 ph10 220
2055 ph10 218 /* In UTF-8 mode, opcodes that are followed by a character may be followed
2056     by a multi-byte character. The length in the table is a minimum, so we have
2057     to arrange to skip the extra bytes. */
2058 ph10 220
2059 ph10 107 #ifdef SUPPORT_UTF8
2060 nigel 77 if (utf8) switch(c)
2061     {
2062     case OP_CHAR:
2063 ph10 602 case OP_CHARI:
2064 nigel 77 case OP_EXACT:
2065 ph10 602 case OP_EXACTI:
2066 nigel 77 case OP_UPTO:
2067 ph10 602 case OP_UPTOI:
2068 nigel 77 case OP_MINUPTO:
2069 ph10 602 case OP_MINUPTOI:
2070 nigel 93 case OP_POSUPTO:
2071 ph10 602 case OP_POSUPTOI:
2072 nigel 77 case OP_STAR:
2073 ph10 602 case OP_STARI:
2074 nigel 77 case OP_MINSTAR:
2075 ph10 602 case OP_MINSTARI:
2076 nigel 93 case OP_POSSTAR:
2077 ph10 602 case OP_POSSTARI:
2078 nigel 77 case OP_PLUS:
2079 ph10 602 case OP_PLUSI:
2080 nigel 77 case OP_MINPLUS:
2081 ph10 602 case OP_MINPLUSI:
2082 nigel 93 case OP_POSPLUS:
2083 ph10 602 case OP_POSPLUSI:
2084 nigel 77 case OP_QUERY:
2085 ph10 602 case OP_QUERYI:
2086 nigel 77 case OP_MINQUERY:
2087 ph10 602 case OP_MINQUERYI:
2088 nigel 93 case OP_POSQUERY:
2089 ph10 602 case OP_POSQUERYI:
2090 nigel 93 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
2091 nigel 77 break;
2092     }
2093 ph10 369 #else
2094     (void)(utf8); /* Keep compiler happy by referencing function argument */
2095 ph10 111 #endif
2096 nigel 77 }
2097     }
2098     }
2099    
2100    
2101    
2102     /*************************************************
2103     * Scan compiled branch for non-emptiness *
2104     *************************************************/
2105    
2106     /* This function scans through a branch of a compiled pattern to see whether it
2107 nigel 93 can match the empty string or not. It is called from could_be_empty()
2108     below and from compile_branch() when checking for an unlimited repeat of a
2109     group that can match nothing. Note that first_significant_code() skips over
2110 ph10 282 backward and negative forward assertions when its final argument is TRUE. If we
2111     hit an unclosed bracket, we return "empty" - this means we've struck an inner
2112     bracket whose current branch will already have been scanned.
2113 nigel 77
2114     Arguments:
2115     code points to start of search
2116     endcode points to where to stop
2117     utf8 TRUE if in UTF8 mode
2118 ph10 503 cd contains pointers to tables etc.
2119 nigel 77
2120     Returns: TRUE if what is matched could be empty
2121     */
2122    
2123     static BOOL
2124 ph10 503 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8,
2125     compile_data *cd)
2126 nigel 77 {
2127     register int c;
2128 ph10 604 for (code = first_significant_code(code + _pcre_OP_lengths[*code], TRUE);
2129 nigel 77 code < endcode;
2130 ph10 604 code = first_significant_code(code + _pcre_OP_lengths[c], TRUE))
2131 nigel 77 {
2132     const uschar *ccode;
2133    
2134     c = *code;
2135 ph10 507
2136 ph10 286 /* Skip over forward assertions; the other assertions are skipped by
2137 ph10 282 first_significant_code() with a TRUE final argument. */
2138 ph10 286
2139 ph10 282 if (c == OP_ASSERT)
2140 ph10 286 {
2141 ph10 282 do code += GET(code, 1); while (*code == OP_ALT);
2142     c = *code;
2143     continue;
2144 ph10 286 }
2145 ph10 172
2146 ph10 503 /* For a recursion/subroutine call, if its end has been reached, which
2147 ph10 624 implies a backward reference subroutine call, we can scan it. If it's a
2148     forward reference subroutine call, we can't. To detect forward reference
2149 ph10 654 we have to scan up the list that is kept in the workspace. This function is
2150     called only when doing the real compile, not during the pre-compile that
2151 ph10 624 measures the size of the compiled pattern. */
2152 ph10 507
2153 ph10 503 if (c == OP_RECURSE)
2154     {
2155 ph10 624 const uschar *scode;
2156     BOOL empty_branch;
2157 ph10 654
2158 ph10 624 /* Test for forward reference */
2159 ph10 654
2160 ph10 624 for (scode = cd->start_workspace; scode < cd->hwm; scode += LINK_SIZE)
2161 ph10 654 if (GET(scode, 0) == code + 1 - cd->start_code) return TRUE;
2162 ph10 624
2163     /* Not a forward reference, test for completed backward reference */
2164 ph10 654
2165 ph10 624 empty_branch = FALSE;
2166     scode = cd->start_code + GET(code, 1);
2167 ph10 503 if (GET(scode, 1) == 0) return TRUE; /* Unclosed */
2168 ph10 654
2169 ph10 624 /* Completed backwards reference */
2170 ph10 654
2171 ph10 503 do
2172     {
2173 ph10 504 if (could_be_empty_branch(scode, endcode, utf8, cd))
2174     {
2175     empty_branch = TRUE;
2176 ph10 507 break;
2177     }
2178 ph10 503 scode += GET(scode, 1);
2179     }
2180     while (*scode == OP_ALT);
2181 ph10 654
2182 ph10 504 if (!empty_branch) return FALSE; /* All branches are non-empty */
2183 ph10 503 continue;
2184 ph10 507 }
2185 ph10 170
2186 ph10 604 /* Groups with zero repeats can of course be empty; skip them. */
2187    
2188     if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO ||
2189     c == OP_BRAPOSZERO)
2190     {
2191     code += _pcre_OP_lengths[c];
2192     do code += GET(code, 1); while (*code == OP_ALT);
2193     c = *code;
2194     continue;
2195     }
2196    
2197     /* A nested group that is already marked as "could be empty" can just be
2198     skipped. */
2199    
2200     if (c == OP_SBRA || c == OP_SBRAPOS ||
2201     c == OP_SCBRA || c == OP_SCBRAPOS)
2202     {
2203     do code += GET(code, 1); while (*code == OP_ALT);
2204     c = *code;
2205     continue;
2206     }
2207    
2208 ph10 170 /* For other groups, scan the branches. */
2209 ph10 172
2210 ph10 604 if (c == OP_BRA || c == OP_BRAPOS ||
2211     c == OP_CBRA || c == OP_CBRAPOS ||
2212 ph10 723 c == OP_ONCE || c == OP_ONCE_NC ||
2213     c == OP_COND)
2214 nigel 77 {
2215     BOOL empty_branch;
2216     if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
2217 ph10 406
2218     /* If a conditional group has only one branch, there is a second, implied,
2219 ph10 395 empty branch, so just skip over the conditional, because it could be empty.
2220     Otherwise, scan the individual branches of the group. */
2221 ph10 406
2222 ph10 395 if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
2223 nigel 77 code += GET(code, 1);
2224 ph10 395 else
2225 ph10 406 {
2226 ph10 395 empty_branch = FALSE;
2227     do
2228     {
2229 ph10 503 if (!empty_branch && could_be_empty_branch(code, endcode, utf8, cd))
2230 ph10 395 empty_branch = TRUE;
2231     code += GET(code, 1);
2232     }
2233     while (*code == OP_ALT);
2234     if (!empty_branch) return FALSE; /* All branches are non-empty */
2235 nigel 77 }
2236 ph10 406
2237 ph10 172 c = *code;
2238 nigel 93 continue;
2239 nigel 77 }
2240    
2241 nigel 93 /* Handle the other opcodes */
2242    
2243     switch (c)
2244 nigel 77 {
2245 ph10 216 /* Check for quantifiers after a class. XCLASS is used for classes that
2246     cannot be represented just by a bit map. This includes negated single
2247     high-valued characters. The length in _pcre_OP_lengths[] is zero; the
2248 ph10 220 actual length is stored in the compiled code, so we must update "code"
2249 ph10 216 here. */
2250 nigel 77
2251     #ifdef SUPPORT_UTF8
2252     case OP_XCLASS:
2253 ph10 216 ccode = code += GET(code, 1);
2254 nigel 77 goto CHECK_CLASS_REPEAT;
2255     #endif
2256    
2257     case OP_CLASS:
2258     case OP_NCLASS:
2259     ccode = code + 33;
2260    
2261     #ifdef SUPPORT_UTF8
2262     CHECK_CLASS_REPEAT:
2263     #endif
2264    
2265     switch (*ccode)
2266     {
2267     case OP_CRSTAR: /* These could be empty; continue */
2268     case OP_CRMINSTAR:
2269     case OP_CRQUERY:
2270     case OP_CRMINQUERY:
2271     break;
2272    
2273     default: /* Non-repeat => class must match */
2274     case OP_CRPLUS: /* These repeats aren't empty */
2275     case OP_CRMINPLUS:
2276     return FALSE;
2277    
2278     case OP_CRRANGE:
2279     case OP_CRMINRANGE:
2280     if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
2281     break;
2282     }
2283     break;
2284    
2285     /* Opcodes that must match a character */
2286    
2287     case OP_PROP:
2288     case OP_NOTPROP:
2289     case OP_EXTUNI:
2290     case OP_NOT_DIGIT:
2291     case OP_DIGIT:
2292     case OP_NOT_WHITESPACE:
2293     case OP_WHITESPACE:
2294     case OP_NOT_WORDCHAR:
2295     case OP_WORDCHAR:
2296     case OP_ANY:
2297 ph10 345 case OP_ALLANY:
2298 nigel 77 case OP_ANYBYTE:
2299     case OP_CHAR:
2300 ph10 602 case OP_CHARI:
2301 nigel 77 case OP_NOT:
2302 ph10 602 case OP_NOTI:
2303 nigel 77 case OP_PLUS:
2304     case OP_MINPLUS:
2305 nigel 93 case OP_POSPLUS:
2306 nigel 77 case OP_EXACT:
2307     case OP_NOTPLUS:
2308     case OP_NOTMINPLUS:
2309 nigel 93 case OP_NOTPOSPLUS:
2310 nigel 77 case OP_NOTEXACT:
2311     case OP_TYPEPLUS:
2312     case OP_TYPEMINPLUS:
2313 nigel 93 case OP_TYPEPOSPLUS:
2314 nigel 77 case OP_TYPEEXACT:
2315     return FALSE;
2316 ph10 227
2317     /* These are going to continue, as they may be empty, but we have to
2318     fudge the length for the \p and \P cases. */
2319    
2320 ph10 224 case OP_TYPESTAR:
2321     case OP_TYPEMINSTAR:
2322     case OP_TYPEPOSSTAR:
2323     case OP_TYPEQUERY:
2324     case OP_TYPEMINQUERY:
2325     case OP_TYPEPOSQUERY:
2326     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2327 ph10 227 break;
2328    
2329 ph10 224 /* Same for these */
2330 ph10 227
2331 ph10 224 case OP_TYPEUPTO:
2332     case OP_TYPEMINUPTO:
2333     case OP_TYPEPOSUPTO:
2334     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
2335     break;
2336 nigel 77
2337     /* End of branch */
2338    
2339     case OP_KET:
2340     case OP_KETRMAX:
2341     case OP_KETRMIN:
2342 ph10 604 case OP_KETRPOS:
2343 nigel 77 case OP_ALT:
2344     return TRUE;
2345    
2346 nigel 93 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
2347     MINUPTO, and POSUPTO may be followed by a multibyte character */
2348 nigel 77
2349     #ifdef SUPPORT_UTF8
2350     case OP_STAR:
2351 ph10 602 case OP_STARI:
2352 nigel 77 case OP_MINSTAR:
2353 ph10 602 case OP_MINSTARI:
2354 nigel 93 case OP_POSSTAR:
2355 ph10 602 case OP_POSSTARI:
2356 nigel 77 case OP_QUERY:
2357 ph10 602 case OP_QUERYI:
2358 nigel 77 case OP_MINQUERY:
2359 ph10 602 case OP_MINQUERYI:
2360 nigel 93 case OP_POSQUERY:
2361 ph10 602 case OP_POSQUERYI:
2362 ph10 426 if (utf8 && code[1] >= 0xc0) code += _pcre_utf8_table4[code[1] & 0x3f];
2363     break;
2364 ph10 461
2365 nigel 77 case OP_UPTO:
2366 ph10 602 case OP_UPTOI:
2367 nigel 77 case OP_MINUPTO:
2368 ph10 602 case OP_MINUPTOI:
2369 nigel 93 case OP_POSUPTO:
2370 ph10 602 case OP_POSUPTOI:
2371 ph10 426 if (utf8 && code[3] >= 0xc0) code += _pcre_utf8_table4[code[3] & 0x3f];
2372 nigel 77 break;
2373     #endif
2374 ph10 503
2375 ph10 510 /* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument
2376     string. */
2377    
2378     case OP_MARK:
2379     case OP_PRUNE_ARG:
2380     case OP_SKIP_ARG:
2381     code += code[1];
2382 ph10 512 break;
2383 ph10 510
2384 ph10 550 case OP_THEN_ARG:
2385 ph10 716 code += code[1];
2386 ph10 550 break;
2387    
2388 ph10 503 /* None of the remaining opcodes are required to match a character. */
2389 ph10 507
2390 ph10 503 default:
2391 ph10 507 break;
2392 nigel 77 }
2393     }
2394    
2395     return TRUE;
2396     }
2397    
2398    
2399    
2400     /*************************************************
2401     * Scan compiled regex for non-emptiness *
2402     *************************************************/
2403    
2404     /* This function is called to check for left recursive calls. We want to check
2405     the current branch of the current pattern to see if it could match the empty
2406     string. If it could, we must look outwards for branches at other levels,
2407     stopping when we pass beyond the bracket which is the subject of the recursion.
2408 ph10 654 This function is called only during the real compile, not during the
2409 ph10 624 pre-compile.
2410 nigel 77
2411     Arguments:
2412     code points to start of the recursion
2413     endcode points to where to stop (current RECURSE item)
2414     bcptr points to the chain of current (unclosed) branch starts
2415     utf8 TRUE if in UTF-8 mode
2416 ph10 507 cd pointers to tables etc
2417 nigel 77
2418     Returns: TRUE if what is matched could be empty
2419     */
2420    
2421     static BOOL
2422     could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
2423 ph10 503 BOOL utf8, compile_data *cd)
2424 nigel 77 {
2425 ph10 475 while (bcptr != NULL && bcptr->current_branch >= code)
2426 nigel 77 {
2427 ph10 503 if (!could_be_empty_branch(bcptr->current_branch, endcode, utf8, cd))
2428 ph10 475 return FALSE;
2429 nigel 77 bcptr = bcptr->outer;
2430     }
2431     return TRUE;
2432     }
2433    
2434    
2435    
2436     /*************************************************
2437     * Check for POSIX class syntax *
2438     *************************************************/
2439    
2440     /* This function is called when the sequence "[:" or "[." or "[=" is
2441 ph10 295 encountered in a character class. It checks whether this is followed by a
2442 ph10 298 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
2443 ph10 295 reach an unescaped ']' without the special preceding character, return FALSE.
2444 nigel 77
2445 ph10 298 Originally, this function only recognized a sequence of letters between the
2446     terminators, but it seems that Perl recognizes any sequence of characters,
2447     though of course unknown POSIX names are subsequently rejected. Perl gives an
2448     "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
2449     didn't consider this to be a POSIX class. Likewise for [:1234:].
2450 ph10 295
2451 ph10 298 The problem in trying to be exactly like Perl is in the handling of escapes. We
2452     have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
2453     class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
2454     below handles the special case of \], but does not try to do any other escape
2455     processing. This makes it different from Perl for cases such as [:l\ower:]
2456 ph10 295 where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
2457 ph10 298 "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
2458 ph10 295 I think.
2459    
2460 ph10 640 A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
2461     It seems that the appearance of a nested POSIX class supersedes an apparent
2462     external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
2463 ph10 691 a digit.
2464 ph10 640
2465 ph10 661 In Perl, unescaped square brackets may also appear as part of class names. For
2466     example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
2467     [:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
2468 ph10 691 seem right at all. PCRE does not allow closing square brackets in POSIX class
2469 ph10 661 names.
2470    
2471 ph10 295 Arguments:
2472 nigel 77 ptr pointer to the initial [
2473     endptr where to return the end pointer
2474    
2475     Returns: TRUE or FALSE
2476     */
2477    
2478     static BOOL
2479 ph10 295 check_posix_syntax(const uschar *ptr, const uschar **endptr)
2480 nigel 77 {
2481     int terminator; /* Don't combine these lines; the Solaris cc */
2482     terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
2483 ph10 295 for (++ptr; *ptr != 0; ptr++)
2484 nigel 77 {
2485 ph10 654 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2486     ptr++;
2487 ph10 691 else if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2488 ph10 640 else
2489 ph10 298 {
2490 ph10 391 if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2491 ph10 295 {
2492     *endptr = ptr;
2493     return TRUE;
2494 ph10 298 }
2495 ph10 640 if (*ptr == CHAR_LEFT_SQUARE_BRACKET &&
2496     (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
2497     ptr[1] == CHAR_EQUALS_SIGN) &&
2498     check_posix_syntax(ptr, endptr))
2499 ph10 654 return FALSE;
2500 ph10 298 }
2501     }
2502 nigel 77 return FALSE;
2503     }
2504    
2505    
2506    
2507    
2508     /*************************************************
2509     * Check POSIX class name *
2510     *************************************************/
2511    
2512     /* This function is called to check the name given in a POSIX-style class entry
2513     such as [:alnum:].
2514    
2515     Arguments:
2516     ptr points to the first letter
2517     len the length of the name
2518    
2519     Returns: a value representing the name, or -1 if unknown
2520     */
2521    
2522     static int
2523     check_posix_name(const uschar *ptr, int len)
2524     {
2525 ph10 240 const char *pn = posix_names;
2526 nigel 77 register int yield = 0;
2527     while (posix_name_lengths[yield] != 0)
2528     {
2529     if (len == posix_name_lengths[yield] &&
2530 ph10 240 strncmp((const char *)ptr, pn, len) == 0) return yield;
2531 ph10 243 pn += posix_name_lengths[yield] + 1;
2532 nigel 77 yield++;
2533     }
2534     return -1;
2535     }
2536    
2537    
2538     /*************************************************
2539     * Adjust OP_RECURSE items in repeated group *
2540     *************************************************/
2541    
2542     /* OP_RECURSE items contain an offset from the start of the regex to the group
2543     that is referenced. This means that groups can be replicated for fixed
2544     repetition simply by copying (because the recursion is allowed to refer to
2545     earlier groups that are outside the current group). However, when a group is
2546 ph10 335 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
2547     inserted before it, after it has been compiled. This means that any OP_RECURSE
2548     items within it that refer to the group itself or any contained groups have to
2549     have their offsets adjusted. That one of the jobs of this function. Before it
2550     is called, the partially compiled regex must be temporarily terminated with
2551     OP_END.
2552 nigel 77
2553 nigel 93 This function has been extended with the possibility of forward references for
2554     recursions and subroutine calls. It must also check the list of such references
2555     for the group we are dealing with. If it finds that one of the recursions in
2556     the current group is on this list, it adjusts the offset in the list, not the
2557     value in the reference (which is a group number).
2558    
2559 nigel 77 Arguments:
2560     group points to the start of the group
2561     adjust the amount by which the group is to be moved
2562     utf8 TRUE in UTF-8 mode
2563     cd contains pointers to tables etc.
2564 nigel 93 save_hwm the hwm forward reference pointer at the start of the group
2565 nigel 77
2566     Returns: nothing
2567     */
2568    
2569     static void
2570 nigel 93 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
2571     uschar *save_hwm)
2572 nigel 77 {
2573     uschar *ptr = group;
2574 ph10 224
2575 nigel 77 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
2576     {
2577 nigel 93 int offset;
2578     uschar *hc;
2579    
2580     /* See if this recursion is on the forward reference list. If so, adjust the
2581     reference. */
2582 ph10 345
2583 nigel 93 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
2584     {
2585     offset = GET(hc, 0);
2586     if (cd->start_code + offset == ptr + 1)
2587     {
2588     PUT(hc, 0, offset + adjust);
2589     break;
2590     }
2591     }
2592    
2593     /* Otherwise, adjust the recursion offset if it's after the start of this
2594     group. */
2595    
2596     if (hc >= cd->hwm)
2597     {
2598     offset = GET(ptr, 1);
2599     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
2600     }
2601    
2602 nigel 77 ptr += 1 + LINK_SIZE;
2603     }
2604     }
2605    
2606    
2607    
2608     /*************************************************
2609     * Insert an automatic callout point *
2610     *************************************************/
2611    
2612     /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
2613     callout points before each pattern item.
2614    
2615     Arguments:
2616     code current code pointer
2617     ptr current pattern pointer
2618     cd pointers to tables etc
2619    
2620     Returns: new code pointer
2621     */
2622    
2623     static uschar *
2624     auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
2625     {
2626     *code++ = OP_CALLOUT;
2627     *code++ = 255;
2628 ph10 530 PUT(code, 0, (int)(ptr - cd->start_pattern)); /* Pattern offset */
2629     PUT(code, LINK_SIZE, 0); /* Default length */
2630 nigel 77 return code + 2*LINK_SIZE;
2631     }
2632    
2633    
2634    
2635     /*************************************************
2636     * Complete a callout item *
2637     *************************************************/
2638    
2639     /* A callout item contains the length of the next item in the pattern, which
2640     we can't fill in till after we have reached the relevant point. This is used
2641     for both automatic and manual callouts.
2642    
2643     Arguments:
2644     previous_callout points to previous callout item
2645     ptr current pattern pointer
2646     cd pointers to tables etc
2647    
2648     Returns: nothing
2649     */
2650    
2651     static void
2652     complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
2653     {
2654 ph10 530 int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));
2655 nigel 77 PUT(previous_callout, 2 + LINK_SIZE, length);
2656     }
2657    
2658    
2659    
2660     #ifdef SUPPORT_UCP
2661     /*************************************************
2662     * Get othercase range *
2663     *************************************************/
2664    
2665     /* This function is passed the start and end of a class range, in UTF-8 mode
2666     with UCP support. It searches up the characters, looking for internal ranges of
2667     characters in the "other" case. Each call returns the next one, updating the
2668     start address.
2669    
2670     Arguments:
2671     cptr points to starting character value; updated
2672     d end value
2673     ocptr where to put start of othercase range
2674     odptr where to put end of othercase range
2675    
2676     Yield: TRUE when range returned; FALSE when no more
2677     */
2678    
2679     static BOOL
2680 nigel 93 get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
2681     unsigned int *odptr)
2682 nigel 77 {
2683 nigel 93 unsigned int c, othercase, next;
2684 nigel 77
2685     for (c = *cptr; c <= d; c++)
2686 ph10 349 { if ((othercase = UCD_OTHERCASE(c)) != c) break; }
2687 nigel 77
2688     if (c > d) return FALSE;
2689    
2690     *ocptr = othercase;
2691     next = othercase + 1;
2692    
2693     for (++c; c <= d; c++)
2694     {
2695 ph10 349 if (UCD_OTHERCASE(c) != next) break;
2696 nigel 77 next++;
2697     }
2698    
2699     *odptr = next - 1;
2700     *cptr = c;
2701    
2702     return TRUE;
2703     }
2704 ph10 532
2705    
2706    
2707     /*************************************************
2708     * Check a character and a property *
2709     *************************************************/
2710    
2711     /* This function is called by check_auto_possessive() when a property item
2712     is adjacent to a fixed character.
2713    
2714     Arguments:
2715     c the character
2716     ptype the property type
2717     pdata the data for the type
2718     negated TRUE if it's a negated property (\P or \p{^)
2719 ph10 535
2720 ph10 532 Returns: TRUE if auto-possessifying is OK
2721 ph10 535 */
2722 ph10 532
2723     static BOOL
2724     check_char_prop(int c, int ptype, int pdata, BOOL negated)
2725     {
2726     const ucd_record *prop = GET_UCD(c);
2727     switch(ptype)
2728     {
2729     case PT_LAMP:
2730     return (prop->chartype == ucp_Lu ||
2731     prop->chartype == ucp_Ll ||
2732     prop->chartype == ucp_Lt) == negated;
2733    
2734     case PT_GC:
2735     return (pdata == _pcre_ucp_gentype[prop->chartype]) == negated;
2736    
2737     case PT_PC:
2738     return (pdata == prop->chartype) == negated;
2739    
2740     case PT_SC:
2741     return (pdata == prop->script) == negated;
2742    
2743     /* These are specials */
2744    
2745     case PT_ALNUM:
2746     return (_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2747     _pcre_ucp_gentype[prop->chartype] == ucp_N) == negated;
2748    
2749     case PT_SPACE: /* Perl space */
2750     return (_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2751     c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2752     == negated;
2753    
2754     case PT_PXSPACE: /* POSIX space */
2755     return (_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2756     c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2757     c == CHAR_FF || c == CHAR_CR)
2758     == negated;
2759    
2760     case PT_WORD:
2761     return (_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2762     _pcre_ucp_gentype[prop->chartype] == ucp_N ||
2763     c == CHAR_UNDERSCORE) == negated;
2764     }
2765 ph10 535 return FALSE;
2766 ph10 532 }
2767 nigel 77 #endif /* SUPPORT_UCP */
2768    
2769    
2770 nigel 93
2771 nigel 77 /*************************************************
2772 nigel 93 * Check if auto-possessifying is possible *
2773     *************************************************/
2774    
2775     /* This function is called for unlimited repeats of certain items, to see
2776     whether the next thing could possibly match the repeated item. If not, it makes
2777     sense to automatically possessify the repeated item.
2778    
2779     Arguments:
2780 ph10 532 previous pointer to the repeated opcode
2781 nigel 93 utf8 TRUE in UTF-8 mode
2782     ptr next character in pattern
2783     options options bits
2784     cd contains pointers to tables etc.
2785    
2786     Returns: TRUE if possessifying is wanted
2787     */
2788    
2789     static BOOL
2790 ph10 535 check_auto_possessive(const uschar *previous, BOOL utf8, const uschar *ptr,
2791 ph10 532 int options, compile_data *cd)
2792 nigel 93 {
2793 ph10 532 int c, next;
2794     int op_code = *previous++;
2795 nigel 93
2796     /* Skip whitespace and comments in extended mode */
2797    
2798     if ((options & PCRE_EXTENDED) != 0)
2799     {
2800     for (;;)
2801     {
2802     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2803 ph10 391 if (*ptr == CHAR_NUMBER_SIGN)
2804 nigel 93 {
2805 ph10 579 ptr++;
2806 ph10 556 while (*ptr != 0)
2807     {
2808 nigel 93 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2809 ph10 556 ptr++;
2810 ph10 579 #ifdef SUPPORT_UTF8
2811 ph10 556 if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
2812     #endif
2813     }
2814 nigel 93 }
2815     else break;
2816     }
2817     }
2818    
2819     /* If the next item is one that we can handle, get its value. A non-negative
2820     value is a character, a negative value is an escape value. */
2821    
2822 ph10 391 if (*ptr == CHAR_BACKSLASH)
2823 nigel 93 {
2824     int temperrorcode = 0;
2825     next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2826     if (temperrorcode != 0) return FALSE;
2827     ptr++; /* Point after the escape sequence */
2828     }
2829    
2830     else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2831     {
2832     #ifdef SUPPORT_UTF8
2833     if (utf8) { GETCHARINC(next, ptr); } else
2834     #endif
2835     next = *ptr++;
2836     }
2837    
2838     else return FALSE;
2839    
2840     /* Skip whitespace and comments in extended mode */
2841    
2842     if ((options & PCRE_EXTENDED) != 0)
2843     {
2844     for (;;)
2845     {
2846     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2847 ph10 391 if (*ptr == CHAR_NUMBER_SIGN)
2848 nigel 93 {
2849 ph10 579 ptr++;
2850 ph10 556 while (*ptr != 0)
2851     {
2852 nigel 93 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2853 ph10 556 ptr++;
2854 ph10 579 #ifdef SUPPORT_UTF8
2855 ph10 556 if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
2856     #endif
2857     }
2858 nigel 93 }
2859     else break;
2860     }
2861     }
2862    
2863     /* If the next thing is itself optional, we have to give up. */
2864    
2865 ph10 392 if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
2866 ph10 391 strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
2867     return FALSE;
2868 nigel 93
2869 ph10 532 /* Now compare the next item with the previous opcode. First, handle cases when
2870     the next item is a character. */
2871 nigel 93
2872     if (next >= 0) switch(op_code)
2873     {
2874     case OP_CHAR:
2875 ph10 535 #ifdef SUPPORT_UTF8
2876 ph10 532 GETCHARTEST(c, previous);
2877 ph10 369 #else
2878 ph10 532 c = *previous;
2879 ph10 535 #endif
2880     return c != next;
2881 nigel 93
2882 ph10 602 /* For CHARI (caseless character) we must check the other case. If we have
2883 nigel 93 Unicode property support, we can use it to test the other case of
2884     high-valued characters. */
2885    
2886 ph10 602 case OP_CHARI:
2887 ph10 535 #ifdef SUPPORT_UTF8
2888 ph10 532 GETCHARTEST(c, previous);
2889     #else
2890     c = *previous;
2891 ph10 535 #endif
2892 ph10 532 if (c == next) return FALSE;
2893 nigel 93 #ifdef SUPPORT_UTF8
2894     if (utf8)
2895     {
2896     unsigned int othercase;
2897     if (next < 128) othercase = cd->fcc[next]; else
2898     #ifdef SUPPORT_UCP
2899 ph10 349 othercase = UCD_OTHERCASE((unsigned int)next);
2900 nigel 93 #else
2901     othercase = NOTACHAR;
2902     #endif
2903 ph10 532 return (unsigned int)c != othercase;
2904 nigel 93 }
2905     else
2906     #endif /* SUPPORT_UTF8 */
2907 ph10 532 return (c != cd->fcc[next]); /* Non-UTF-8 mode */
2908 nigel 93
2909 ph10 602 /* For OP_NOT and OP_NOTI, the data is always a single-byte character. These
2910 ph10 604 opcodes are not used for multi-byte characters, because they are coded using
2911 ph10 602 an XCLASS instead. */
2912 nigel 93
2913     case OP_NOT:
2914 ph10 602 return (c = *previous) == next;
2915 ph10 604
2916     case OP_NOTI:
2917 ph10 532 if ((c = *previous) == next) return TRUE;
2918 nigel 93 #ifdef SUPPORT_UTF8
2919     if (utf8)
2920     {
2921     unsigned int othercase;
2922     if (next < 128) othercase = cd->fcc[next]; else
2923     #ifdef SUPPORT_UCP
2924 ph10 349 othercase = UCD_OTHERCASE(next);
2925 nigel 93 #else
2926     othercase = NOTACHAR;
2927     #endif
2928 ph10 532 return (unsigned int)c == othercase;
2929 nigel 93 }
2930     else
2931     #endif /* SUPPORT_UTF8 */
2932 ph10 532 return (c == cd->fcc[next]); /* Non-UTF-8 mode */
2933 nigel 93
2934 ph10 535 /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.
2935     When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
2936    
2937 nigel 93 case OP_DIGIT:
2938     return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2939    
2940     case OP_NOT_DIGIT:
2941     return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2942    
2943     case OP_WHITESPACE:
2944     return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2945    
2946     case OP_NOT_WHITESPACE:
2947     return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2948    
2949     case OP_WORDCHAR:
2950     return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2951    
2952     case OP_NOT_WORDCHAR:
2953     return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2954    
2955 ph10 180 case OP_HSPACE:
2956     case OP_NOT_HSPACE:
2957     switch(next)
2958     {
2959     case 0x09:
2960     case 0x20:
2961     case 0xa0:
2962     case 0x1680:
2963     case 0x180e:
2964     case 0x2000:
2965     case 0x2001:
2966     case 0x2002:
2967     case 0x2003:
2968     case 0x2004:
2969     case 0x2005:
2970     case 0x2006:
2971     case 0x2007:
2972     case 0x2008:
2973     case 0x2009:
2974     case 0x200A:
2975     case 0x202f:
2976     case 0x205f:
2977     case 0x3000:
2978 ph10 528 return op_code == OP_NOT_HSPACE;
2979 ph10 180 default:
2980 ph10 528 return op_code != OP_NOT_HSPACE;
2981 ph10 180 }
2982    
2983 ph10 528 case OP_ANYNL:
2984 ph10 180 case OP_VSPACE:
2985     case OP_NOT_VSPACE:
2986     switch(next)
2987     {
2988     case 0x0a:
2989     case 0x0b:
2990     case 0x0c:
2991     case 0x0d:
2992     case 0x85:
2993     case 0x2028:
2994     case 0x2029:
2995 ph10 528 return op_code == OP_NOT_VSPACE;
2996 ph10 180 default:
2997 ph10 528 return op_code != OP_NOT_VSPACE;
2998 ph10 180 }
2999    
3000 ph10 532 #ifdef SUPPORT_UCP
3001     case OP_PROP:
3002     return check_char_prop(next, previous[0], previous[1], FALSE);
3003 ph10 535
3004 ph10 532 case OP_NOTPROP:
3005     return check_char_prop(next, previous[0], previous[1], TRUE);
3006     #endif
3007    
3008 nigel 93 default:
3009     return FALSE;
3010     }
3011    
3012    
3013 ph10 535 /* Handle the case when the next item is \d, \s, etc. Note that when PCRE_UCP
3014     is set, \d turns into ESC_du rather than ESC_d, etc., so ESC_d etc. are
3015     generated only when PCRE_UCP is *not* set, that is, when only ASCII
3016     characteristics are recognized. Similarly, the opcodes OP_DIGIT etc. are
3017 ph10 532 replaced by OP_PROP codes when PCRE_UCP is set. */
3018 nigel 93
3019     switch(op_code)
3020     {
3021     case OP_CHAR:
3022 ph10 602 case OP_CHARI:
3023 ph10 535 #ifdef SUPPORT_UTF8
3024 ph10 532 GETCHARTEST(c, previous);
3025     #else
3026     c = *previous;
3027 ph10 535 #endif
3028 nigel 93 switch(-next)
3029     {
3030     case ESC_d:
3031 ph10 532 return c > 127 || (cd->ctypes[c] & ctype_digit) == 0;
3032 nigel 93
3033     case ESC_D:
3034 ph10 532 return c <= 127 && (cd->ctypes[c] & ctype_digit) != 0;
3035 nigel 93
3036     case ESC_s:
3037 ph10 532 return c > 127 || (cd->ctypes[c] & ctype_space) == 0;
3038 nigel 93
3039     case ESC_S:
3040 ph10 532 return c <= 127 && (cd->ctypes[c] & ctype_space) != 0;
3041 nigel 93
3042     case ESC_w:
3043 ph10 532 return c > 127 || (cd->ctypes[c] & ctype_word) == 0;
3044 nigel 93
3045     case ESC_W:
3046 ph10 532 return c <= 127 && (cd->ctypes[c] & ctype_word) != 0;
3047 ph10 182
3048 ph10 180 case ESC_h:
3049     case ESC_H:
3050 ph10 532 switch(c)
3051 ph10 180 {
3052     case 0x09:
3053     case 0x20:
3054     case 0xa0:
3055     case 0x1680:
3056     case 0x180e:
3057     case 0x2000:
3058     case 0x2001:
3059     case 0x2002:
3060     case 0x2003:
3061     case 0x2004:
3062     case 0x2005:
3063     case 0x2006:
3064     case 0x2007:
3065     case 0x2008:
3066     case 0x2009:
3067     case 0x200A:
3068     case 0x202f:
3069     case 0x205f:
3070     case 0x3000:
3071     return -next != ESC_h;
3072     default:
3073     return -next == ESC_h;
3074 ph10 182 }
3075    
3076 ph10 180 case ESC_v:
3077     case ESC_V:
3078 ph10 532 switch(c)
3079 ph10 180 {
3080     case 0x0a:
3081     case 0x0b:
3082     case 0x0c:
3083     case 0x0d:
3084     case 0x85:
3085     case 0x2028:
3086     case 0x2029:
3087     return -next != ESC_v;
3088     default:
3089     return -next == ESC_v;
3090 ph10 182 }
3091 ph10 535
3092     /* When PCRE_UCP is set, these values get generated for \d etc. Find
3093     their substitutions and process them. The result will always be either
3094 ph10 532 -ESC_p or -ESC_P. Then fall through to process those values. */
3095 ph10 535
3096 ph10 532 #ifdef SUPPORT_UCP
3097     case ESC_du:
3098     case ESC_DU:
3099     case ESC_wu:
3100     case ESC_WU:
3101     case ESC_su:
3102     case ESC_SU:
3103     {
3104     int temperrorcode = 0;
3105     ptr = substitutes[-next - ESC_DU];
3106     next = check_escape(&ptr, &temperrorcode, 0, options, FALSE);
3107     if (temperrorcode != 0) return FALSE;
3108     ptr++; /* For compatibility */
3109     }
3110 ph10 535 /* Fall through */
3111 nigel 93
3112 ph10 532 case ESC_p:
3113     case ESC_P:
3114     {
3115     int ptype, pdata, errorcodeptr;
3116 ph10 535 BOOL negated;
3117    
3118 ph10 532 ptr--; /* Make ptr point at the p or P */
3119     ptype = get_ucp(&ptr, &negated, &pdata, &errorcodeptr);
3120     if (ptype < 0) return FALSE;
3121     ptr++; /* Point past the final curly ket */
3122 ph10 535
3123 ph10 532 /* If the property item is optional, we have to give up. (When generated
3124     from \d etc by PCRE_UCP, this test will have been applied much earlier,
3125     to the original \d etc. At this point, ptr will point to a zero byte. */
3126 ph10 535
3127 ph10 532 if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
3128     strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
3129     return FALSE;
3130 ph10 535
3131 ph10 532 /* Do the property check. */
3132 ph10 535
3133 ph10 532 return check_char_prop(c, ptype, pdata, (next == -ESC_P) != negated);
3134 ph10 535 }
3135 ph10 532 #endif
3136    
3137 nigel 93 default:
3138     return FALSE;
3139     }
3140    
3141 ph10 535 /* In principle, support for Unicode properties should be integrated here as
3142     well. It means re-organizing the above code so as to get hold of the property
3143     values before switching on the op-code. However, I wonder how many patterns
3144     combine ASCII \d etc with Unicode properties? (Note that if PCRE_UCP is set,
3145     these op-codes are never generated.) */
3146    
3147 nigel 93 case OP_DIGIT:
3148 ph10 180 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
3149 ph10 528 next == -ESC_h || next == -ESC_v || next == -ESC_R;
3150 nigel 93
3151     case OP_NOT_DIGIT:
3152     return next == -ESC_d;
3153    
3154     case OP_WHITESPACE:
3155 ph10 528 return next == -ESC_S || next == -ESC_d || next == -ESC_w || next == -ESC_R;
3156 nigel 93
3157     case OP_NOT_WHITESPACE:
3158 ph10 180 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
3159 nigel 93
3160 ph10 180 case OP_HSPACE:
3161 ph10 535 return next == -ESC_S || next == -ESC_H || next == -ESC_d ||
3162 ph10 528 next == -ESC_w || next == -ESC_v || next == -ESC_R;
3163 ph10 180
3164     case OP_NOT_HSPACE:
3165     return next == -ESC_h;
3166 ph10 182
3167 ph10 180 /* Can't have \S in here because VT matches \S (Perl anomaly) */
3168 ph10 535 case OP_ANYNL:
3169 ph10 182 case OP_VSPACE:
3170 ph10 180 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
3171    
3172     case OP_NOT_VSPACE:
3173 ph10 528 return next == -ESC_v || next == -ESC_R;
3174 ph10 180
3175 nigel 93 case OP_WORDCHAR:
3176 ph10 535 return next == -ESC_W || next == -ESC_s || next == -ESC_h ||
3177 ph10 528 next == -ESC_v || next == -ESC_R;
3178 nigel 93
3179     case OP_NOT_WORDCHAR:
3180     return next == -ESC_w || next == -ESC_d;
3181 ph10 182
3182 nigel 93 default:
3183     return FALSE;
3184     }
3185    
3186     /* Control does not reach here */
3187     }
3188    
3189    
3190    
3191     /*************************************************
3192 nigel 77 * Compile one branch *
3193     *************************************************/
3194    
3195 nigel 93 /* Scan the pattern, compiling it into a vector. If the options are
3196 nigel 77 changed during the branch, the pointer is used to change the external options
3197 nigel 93 bits. This function is used during the pre-compile phase when we are trying
3198     to find out the amount of memory needed, as well as during the real compile
3199     phase. The value of lengthptr distinguishes the two phases.
3200 nigel 77
3201     Arguments:
3202     optionsptr pointer to the option bits
3203     codeptr points to the pointer to the current code point
3204     ptrptr points to the current pattern pointer
3205     errorcodeptr points to error code variable
3206     firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
3207     reqbyteptr set to the last literal character required, else < 0
3208     bcptr points to current branch chain
3209 ph10 654 cond_depth conditional nesting depth
3210 nigel 77 cd contains pointers to tables etc.
3211 nigel 93 lengthptr NULL during the real compile phase
3212     points to length accumulator during pre-compile phase
3213 nigel 77
3214     Returns: TRUE on success
3215     FALSE, with *errorcodeptr set non-zero on error
3216     */
3217    
3218     static BOOL
3219 nigel 93 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
3220     int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
3221 ph10 642 int cond_depth, compile_data *cd, int *lengthptr)
3222 nigel 77 {
3223     int repeat_type, op_type;
3224     int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
3225     int bravalue = 0;
3226     int greedy_default, greedy_non_default;
3227     int firstbyte, reqbyte;
3228     int zeroreqbyte, zerofirstbyte;
3229     int req_caseopt, reqvary, tempreqvary;
3230 ph10 635 int options = *optionsptr; /* May change dynamically */
3231 nigel 77 int after_manual_callout = 0;
3232 nigel 93 int length_prevgroup = 0;
3233 nigel 77 register int c;
3234     register uschar *code = *codeptr;
3235 nigel 93 uschar *last_code = code;
3236     uschar *orig_code = code;
3237 nigel 77 uschar *tempcode;
3238     BOOL inescq = FALSE;
3239     BOOL groupsetfirstbyte = FALSE;
3240     const uschar *ptr = *ptrptr;
3241     const uschar *tempptr;
3242 ph10 518 const uschar *nestptr = NULL;
3243 nigel 77 uschar *previous = NULL;
3244     uschar *previous_callout = NULL;
3245 nigel 93 uschar *save_hwm = NULL;
3246 nigel 77 uschar classbits[32];
3247    
3248 ph10 635 /* We can fish out the UTF-8 setting once and for all into a BOOL, but we
3249 ph10 654 must not do this for other options (e.g. PCRE_EXTENDED) because they may change
3250 ph10 635 dynamically as we process the pattern. */
3251    
3252 nigel 77 #ifdef SUPPORT_UTF8
3253     BOOL class_utf8;
3254     BOOL utf8 = (options & PCRE_UTF8) != 0;
3255     uschar *class_utf8data;
3256 ph10 300 uschar *class_utf8data_base;
3257 nigel 77 uschar utf8_char[6];
3258     #else
3259     BOOL utf8 = FALSE;
3260     #endif
3261    
3262 ph10 475 #ifdef PCRE_DEBUG
3263 nigel 93 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
3264     #endif
3265    
3266 nigel 77 /* Set up the default and non-default settings for greediness */
3267    
3268     greedy_default = ((options & PCRE_UNGREEDY) != 0);
3269     greedy_non_default = greedy_default ^ 1;
3270    
3271     /* Initialize no first byte, no required byte. REQ_UNSET means "no char
3272     matching encountered yet". It gets changed to REQ_NONE if we hit something that
3273     matches a non-fixed char first char; reqbyte just remains unset if we never
3274     find one.
3275    
3276     When we hit a repeat whose minimum is zero, we may have to adjust these values
3277     to take the zero repeat into account. This is implemented by setting them to
3278     zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
3279     item types that can be repeated set these backoff variables appropriately. */
3280    
3281     firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
3282    
3283     /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
3284     according to the current setting of the caseless flag. REQ_CASELESS is a bit
3285     value > 255. It is added into the firstbyte or reqbyte variables to record the
3286     case status of the value. This is used only for ASCII characters. */
3287    
3288     req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
3289    
3290     /* Switch on next character until the end of the branch */
3291    
3292     for (;; ptr++)
3293     {
3294     BOOL negate_class;
3295 ph10 286 BOOL should_flip_negation;
3296 nigel 77 BOOL possessive_quantifier;
3297     BOOL is_quantifier;
3298 nigel 93 BOOL is_recurse;
3299 ph10 180 BOOL reset_bracount;
3300 nigel 77 int class_charcount;
3301     int class_lastchar;
3302     int newoptions;
3303     int recno;
3304 ph10 172 int refsign;
3305 nigel 77 int skipbytes;
3306     int subreqbyte;
3307     int subfirstbyte;
3308 nigel 93 int terminator;
3309 nigel 77 int mclength;
3310 ph10 733 int tempbracount;
3311 nigel 77 uschar mcbuffer[8];
3312    
3313 nigel 93 /* Get next byte in the pattern */
3314 nigel 77
3315     c = *ptr;
3316 ph10 345
3317 ph10 535 /* If we are at the end of a nested substitution, revert to the outer level
3318 ph10 518 string. Nesting only happens one level deep. */
3319    
3320     if (c == 0 && nestptr != NULL)
3321     {
3322     ptr = nestptr;
3323     nestptr = NULL;
3324     c = *ptr;
3325     }
3326    
3327 nigel 93 /* If we are in the pre-compile phase, accumulate the length used for the
3328     previous cycle of this loop. */
3329    
3330     if (lengthptr != NULL)
3331     {
3332 ph10 475 #ifdef PCRE_DEBUG
3333 nigel 93 if (code > cd->hwm) cd->hwm = code; /* High water info */
3334     #endif
3335 ph10 505 if (code > cd->start_workspace + WORK_SIZE_CHECK) /* Check for overrun */
3336 nigel 93 {
3337     *errorcodeptr = ERR52;
3338     goto FAILED;
3339     }
3340    
3341     /* There is at least one situation where code goes backwards: this is the
3342     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
3343     the class is simply eliminated. However, it is created first, so we have to
3344     allow memory for it. Therefore, don't ever reduce the length at this point.
3345     */
3346    
3347     if (code < last_code) code = last_code;
3348 ph10 202
3349     /* Paranoid check for integer overflow */
3350    
3351     if (OFLOW_MAX - *lengthptr < code - last_code)
3352     {
3353     *errorcodeptr = ERR20;
3354     goto FAILED;
3355     }
3356    
3357 ph10 530 *lengthptr += (int)(code - last_code);
3358 ph10 751 DPRINTF(("length=%d added %d c=%c\n", *lengthptr, (int)(code - last_code),
3359     c));
3360 nigel 93
3361     /* If "previous" is set and it is not at the start of the work space, move
3362     it back to there, in order to avoid filling up the work space. Otherwise,
3363     if "previous" is NULL, reset the current code pointer to the start. */
3364    
3365     if (previous != NULL)
3366     {
3367     if (previous > orig_code)
3368     {
3369     memmove(orig_code, previous, code - previous);
3370     code -= previous - orig_code;
3371     previous = orig_code;
3372     }
3373     }
3374     else code = orig_code;
3375    
3376     /* Remember where this code item starts so we can pick up the length
3377     next time round. */
3378    
3379     last_code = code;
3380     }
3381    
3382     /* In the real compile phase, just check the workspace used by the forward
3383     reference list. */
3384    
3385 ph10 505 else if (cd->hwm > cd->start_workspace + WORK_SIZE_CHECK)
3386 nigel 93 {
3387     *errorcodeptr = ERR52;
3388     goto FAILED;
3389     }
3390    
3391 nigel 77 /* If in \Q...\E, check for the end; if not, we have a literal */
3392    
3393     if (inescq && c != 0)
3394     {
3395 ph10 391 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
3396 nigel 77 {
3397     inescq = FALSE;
3398     ptr++;
3399     continue;
3400     }
3401     else
3402     {
3403     if (previous_callout != NULL)
3404     {
3405 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
3406     complete_callout(previous_callout, ptr, cd);
3407 nigel 77 previous_callout = NULL;
3408     }
3409     if ((options & PCRE_AUTO_CALLOUT) != 0)
3410     {
3411     previous_callout = code;
3412     code = auto_callout(code, ptr, cd);
3413     }
3414     goto NORMAL_CHAR;
3415     }
3416     }
3417    
3418     /* Fill in length of a previous callout, except when the next thing is
3419     a quantifier. */
3420    
3421 ph10 392 is_quantifier =
3422 ph10 391 c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
3423     (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
3424 nigel 77
3425     if (!is_quantifier && previous_callout != NULL &&
3426     after_manual_callout-- <= 0)
3427     {
3428 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
3429     complete_callout(previous_callout, ptr, cd);
3430 nigel 77 previous_callout = NULL;
3431     }
3432    
3433 ph10 635 /* In extended mode, skip white space and comments. */
3434 nigel 77
3435     if ((options & PCRE_EXTENDED) != 0)
3436     {
3437     if ((cd->ctypes[c] & ctype_space) != 0) continue;
3438 ph10 391 if (c == CHAR_NUMBER_SIGN)
3439 nigel 77 {
3440 ph10 579 ptr++;
3441 ph10 556 while (*ptr != 0)
3442 nigel 91 {
3443 nigel 93 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
3444 ph10 556 ptr++;
3445 ph10 579 #ifdef SUPPORT_UTF8
3446 ph10 556 if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
3447     #endif
3448 nigel 91 }
3449 nigel 93 if (*ptr != 0) continue;
3450    
3451 nigel 91 /* Else fall through to handle end of string */
3452     c = 0;
3453 nigel 77 }
3454     }
3455    
3456     /* No auto callout for quantifiers. */
3457    
3458     if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
3459     {
3460     previous_callout = code;
3461     code = auto_callout(code, ptr, cd);
3462     }
3463    
3464     switch(c)
3465     {
3466 nigel 93 /* ===================================================================*/
3467     case 0: /* The branch terminates at string end */
3468 ph10 391 case CHAR_VERTICAL_LINE: /* or | or ) */
3469     case CHAR_RIGHT_PARENTHESIS:
3470 nigel 77 *firstbyteptr = firstbyte;
3471     *reqbyteptr = reqbyte;
3472     *codeptr = code;
3473     *ptrptr = ptr;
3474 nigel 93 if (lengthptr != NULL)
3475     {
3476 ph10 202 if (OFLOW_MAX - *lengthptr < code - last_code)
3477     {
3478     *errorcodeptr = ERR20;
3479     goto FAILED;
3480     }
3481 ph10 530 *lengthptr += (int)(code - last_code); /* To include callout length */
3482 nigel 93 DPRINTF((">> end branch\n"));
3483     }
3484 nigel 77 return TRUE;
3485    
3486 nigel 93
3487     /* ===================================================================*/
3488 nigel 77 /* Handle single-character metacharacters. In multiline mode, ^ disables
3489     the setting of any following char as a first character. */
3490    
3491 ph10 391 case CHAR_CIRCUMFLEX_ACCENT:
3492 ph10 602 previous = NULL;
3493 nigel 77 if ((options & PCRE_MULTILINE) != 0)
3494     {
3495     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3496 ph10 602 *code++ = OP_CIRCM;
3497 nigel 77 }
3498 ph10 602 else *code++ = OP_CIRC;
3499 nigel 77 break;
3500    
3501 ph10 391 case CHAR_DOLLAR_SIGN:
3502 nigel 77 previous = NULL;
3503 ph10 602 *code++ = ((options & PCRE_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
3504 nigel 77 break;
3505    
3506     /* There can never be a first char if '.' is first, whatever happens about
3507     repeats. The value of reqbyte doesn't change either. */
3508    
3509 ph10 391 case CHAR_DOT:
3510 nigel 77 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3511     zerofirstbyte = firstbyte;
3512     zeroreqbyte = reqbyte;
3513     previous = code;
3514 ph10 342 *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
3515 nigel 77 break;
3516    
3517 nigel 93
3518     /* ===================================================================*/
3519 nigel 87 /* Character classes. If the included characters are all < 256, we build a
3520     32-byte bitmap of the permitted characters, except in the special case
3521     where there is only one such character. For negated classes, we build the
3522     map as usual, then invert it at the end. However, we use a different opcode
3523     so that data characters > 255 can be handled correctly.
3524 nigel 77
3525     If the class contains characters outside the 0-255 range, a different
3526     opcode is compiled. It may optionally have a bit map for characters < 256,
3527     but those above are explicitly listed afterwards. A flag byte tells
3528     whether the bitmap is present, and whether this is a negated class or not.
3529 ph10 345
3530 ph10 336 In JavaScript compatibility mode, an isolated ']' causes an error. In
3531     default (Perl) mode, it is treated as a data character. */
3532 ph10 345
3533 ph10 391 case CHAR_RIGHT_SQUARE_BRACKET:
3534 ph10 336 if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
3535     {
3536     *errorcodeptr = ERR64;
3537 ph10 345 goto FAILED;
3538 ph10 336 }
3539 ph10 345 goto NORMAL_CHAR;
3540 nigel 77
3541 ph10 391 case CHAR_LEFT_SQUARE_BRACKET:
3542 nigel 77 previous = code;
3543    
3544     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
3545     they are encountered at the top level, so we'll do that too. */
3546    
3547 ph10 392 if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3548 ph10 391 ptr[1] == CHAR_EQUALS_SIGN) &&
3549 ph10 295 check_posix_syntax(ptr, &tempptr))
3550 nigel 77 {
3551 ph10 391 *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
3552 nigel 77 goto FAILED;
3553     }
3554    
3555 ph10 205 /* If the first character is '^', set the negation flag and skip it. Also,
3556 ph10 208 if the first few characters (either before or after ^) are \Q\E or \E we
3557 ph10 205 skip them too. This makes for compatibility with Perl. */
3558 ph10 208
3559 ph10 205 negate_class = FALSE;
3560     for (;;)
3561 nigel 77 {
3562     c = *(++ptr);
3563 ph10 391 if (c == CHAR_BACKSLASH)
3564 ph10 205 {
3565 ph10 392 if (ptr[1] == CHAR_E)
3566 ph10 391 ptr++;
3567 ph10 392 else if (strncmp((const char *)ptr+1,
3568     STR_Q STR_BACKSLASH STR_E, 3) == 0)
3569 ph10 391 ptr += 3;
3570 ph10 392 else
3571 ph10 391 break;
3572 ph10 205 }
3573 ph10 391 else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
3574 ph10 205 negate_class = TRUE;
3575     else break;
3576 ph10 208 }
3577 ph10 345
3578     /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
3579     an initial ']' is taken as a data character -- the code below handles
3580 ph10 341 that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
3581     [^] must match any character, so generate OP_ALLANY. */
3582 ph10 345
3583 ph10 392 if (c == CHAR_RIGHT_SQUARE_BRACKET &&
3584 ph10 391 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
3585 ph10 341 {
3586     *code++ = negate_class? OP_ALLANY : OP_FAIL;
3587     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3588     zerofirstbyte = firstbyte;
3589     break;
3590 ph10 345 }
3591 nigel 77
3592 ph10 286 /* If a class contains a negative special such as \S, we need to flip the
3593     negation flag at the end, so that support for characters > 255 works
3594 ph10 264 correctly (they are all included in the class). */
3595    
3596     should_flip_negation = FALSE;
3597    
3598 nigel 77 /* Keep a count of chars with values < 256 so that we can optimize the case
3599 nigel 93 of just a single character (as long as it's < 256). However, for higher
3600     valued UTF-8 characters, we don't yet do any optimization. */
3601 nigel 77
3602     class_charcount = 0;
3603     class_lastchar = -1;
3604    
3605 nigel 93 /* Initialize the 32-char bit map to all zeros. We build the map in a
3606     temporary bit of memory, in case the class contains only 1 character (less
3607     than 256), because in that case the compiled code doesn't use the bit map.
3608     */
3609    
3610     memset(classbits, 0, 32 * sizeof(uschar));
3611    
3612 nigel 77 #ifdef SUPPORT_UTF8
3613     class_utf8 = FALSE; /* No chars >= 256 */
3614 nigel 93 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
3615 ph10 309 class_utf8data_base = class_utf8data; /* For resetting in pass 1 */
3616 nigel 77 #endif
3617    
3618     /* Process characters until ] is reached. By writing this as a "do" it
3619 nigel 93 means that an initial ] is taken as a data character. At the start of the
3620     loop, c contains the first byte of the character. */
3621 nigel 77
3622 nigel 93 if (c != 0) do
3623 nigel 77 {
3624 nigel 93 const uschar *oldptr;
3625    
3626 nigel 77 #ifdef SUPPORT_UTF8
3627     if (utf8 && c > 127)
3628     { /* Braces are required because the */
3629     GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
3630     }
3631 ph10 535
3632 ph10 300 /* In the pre-compile phase, accumulate the length of any UTF-8 extra
3633 ph10 309 data and reset the pointer. This is so that very large classes that
3634 ph10 300 contain a zillion UTF-8 characters no longer overwrite the work space
3635 ph10 309 (which is on the stack). */
3636    
3637 ph10 300 if (lengthptr != NULL)
3638     {
3639     *lengthptr += class_utf8data - class_utf8data_base;
3640 ph10 309 class_utf8data = class_utf8data_base;
3641     }
3642    
3643 nigel 77 #endif
3644    
3645     /* Inside \Q...\E everything is literal except \E */
3646    
3647     if (inescq)
3648     {
3649 ph10 391 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E) /* If we are at \E */
3650 nigel 77 {
3651 nigel 93 inescq = FALSE; /* Reset literal state */
3652     ptr++; /* Skip the 'E' */
3653     continue; /* Carry on with next */
3654 nigel 77 }
3655 nigel 93 goto CHECK_RANGE; /* Could be range if \E follows */
3656 nigel 77 }
3657    
3658     /* Handle POSIX class names. Perl allows a negation extension of the
3659     form [:^name:]. A square bracket that doesn't match the syntax is
3660     treated as a literal. We also recognize the POSIX constructions
3661     [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
3662     5.6 and 5.8 do. */
3663    
3664 ph10 391 if (c == CHAR_LEFT_SQUARE_BRACKET &&
3665 ph10 392 (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3666 ph10 391 ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
3667 nigel 77 {
3668     BOOL local_negate = FALSE;
3669 nigel 87 int posix_class, taboffset, tabopt;
3670 nigel 77 register const uschar *cbits = cd->cbits;
3671 nigel 87 uschar pbits[32];
3672 nigel 77
3673 ph10 391 if (ptr[1] != CHAR_COLON)
3674 nigel 77 {
3675     *errorcodeptr = ERR31;
3676     goto FAILED;
3677     }
3678    
3679     ptr += 2;
3680 ph10 391 if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
3681 nigel 77 {
3682     local_negate = TRUE;
3683 ph10 286 should_flip_negation = TRUE; /* Note negative special */
3684 nigel 77 ptr++;
3685     }
3686    
3687 ph10 530 posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
3688 nigel 77 if (posix_class < 0)
3689     {
3690     *errorcodeptr = ERR30;
3691     goto FAILED;
3692     }
3693    
3694     /* If matching is caseless, upper and lower are converted to
3695     alpha. This relies on the fact that the class table starts with
3696     alpha, lower, upper as the first 3 entries. */
3697    
3698     if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
3699     posix_class = 0;
3700 ph10 535
3701     /* When PCRE_UCP is set, some of the POSIX classes are converted to
3702 ph10 518 different escape sequences that use Unicode properties. */
3703 ph10 535
3704 ph10 518 #ifdef SUPPORT_UCP
3705     if ((options & PCRE_UCP) != 0)
3706     {
3707     int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);
3708     if (posix_substitutes[pc] != NULL)
3709     {
3710 ph10 535 nestptr = tempptr + 1;
3711 ph10 518 ptr = posix_substitutes[pc] - 1;
3712 ph10 535 continue;
3713     }
3714     }
3715     #endif
3716 ph10 518 /* In the non-UCP case, we build the bit map for the POSIX class in a
3717     chunk of local store because we may be adding and subtracting from it,
3718     and we don't want to subtract bits that may be in the main map already.
3719     At the end we or the result into the bit map that is being built. */
3720 nigel 77
3721     posix_class *= 3;
3722 nigel 87
3723     /* Copy in the first table (always present) */
3724    
3725     memcpy(pbits, cbits + posix_class_maps[posix_class],
3726     32 * sizeof(uschar));
3727    
3728     /* If there is a second table, add or remove it as required. */
3729    
3730     taboffset = posix_class_maps[posix_class + 1];
3731     tabopt = posix_class_maps[posix_class + 2];
3732    
3733     if (taboffset >= 0)
3734 nigel 77 {
3735 nigel 87 if (tabopt >= 0)
3736     for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
3737 nigel 77 else
3738 nigel 87 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
3739 nigel 77 }
3740    
3741 nigel 87 /* Now see if we need to remove any special characters. An option
3742     value of 1 removes vertical space and 2 removes underscore. */
3743    
3744     if (tabopt < 0) tabopt = -tabopt;
3745     if (tabopt == 1) pbits[1] &= ~0x3c;
3746     else if (tabopt == 2) pbits[11] &= 0x7f;
3747    
3748     /* Add the POSIX table or its complement into the main table that is
3749     being built and we are done. */
3750    
3751     if (local_negate)
3752     for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
3753     else
3754     for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
3755    
3756 nigel 77 ptr = tempptr + 1;
3757     class_charcount = 10; /* Set > 1; assumes more than 1 per class */
3758     continue; /* End of POSIX syntax handling */
3759     }
3760    
3761     /* Backslash may introduce a single character, or it may introduce one
3762 nigel 93 of the specials, which just set a flag. The sequence \b is a special
3763 ph10 513 case. Inside a class (and only there) it is treated as backspace. We
3764     assume that other escapes have more than one character in them, so set
3765     class_charcount bigger than one. Unrecognized escapes fall through and
3766     are either treated as literal characters (by default), or are faulted if
3767     PCRE_EXTRA is set. */
3768 nigel 77
3769 ph10 391 if (c == CHAR_BACKSLASH)
3770 nigel 77 {
3771 nigel 93 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3772     if (*errorcodeptr != 0) goto FAILED;
3773 nigel 77
3774 ph10 513 if (-c == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
3775 ph10 758 else if (-c == ESC_N) /* \N is not supported in a class */
3776     {
3777     *errorcodeptr = ERR71;
3778     goto FAILED;
3779     }
3780 nigel 77 else if (-c == ESC_Q) /* Handle start of quoted string */
3781     {
3782 ph10 391 if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3783 nigel 77 {
3784     ptr += 2; /* avoid empty string */
3785     }
3786     else inescq = TRUE;
3787     continue;
3788     }
3789 ph10 220 else if (-c == ESC_E) continue; /* Ignore orphan \E */
3790 nigel 77
3791     if (c < 0)
3792     {
3793     register const uschar *cbits = cd->cbits;
3794     class_charcount += 2; /* Greater than 1 is what matters */
3795 nigel 93
3796 ph10 518 switch (-c)
3797 nigel 77 {
3798 ph10 518 #ifdef SUPPORT_UCP
3799     case ESC_du: /* These are the values given for \d etc */
3800     case ESC_DU: /* when PCRE_UCP is set. We replace the */
3801     case ESC_wu: /* escape sequence with an appropriate \p */
3802     case ESC_WU: /* or \P to test Unicode properties instead */
3803     case ESC_su: /* of the default ASCII testing. */
3804     case ESC_SU:
3805     nestptr = ptr;
3806     ptr = substitutes[-c - ESC_DU] - 1; /* Just before substitute */
3807 ph10 535 class_charcount -= 2; /* Undo! */
3808 ph10 518 continue;
3809     #endif
3810 nigel 77 case ESC_d:
3811     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
3812     continue;
3813    
3814     case ESC_D:
3815 ph10 286 should_flip_negation = TRUE;
3816 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
3817     continue;
3818    
3819     case ESC_w:
3820     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
3821     continue;
3822    
3823     case ESC_W:
3824 ph10 286 should_flip_negation = TRUE;
3825 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
3826     continue;
3827    
3828 ph10 552 /* Perl 5.004 onwards omits VT from \s, but we must preserve it
3829 ph10 579 if it was previously set by something earlier in the character
3830     class. */
3831 ph10 552
3832 nigel 77 case ESC_s:
3833 ph10 552 classbits[0] |= cbits[cbit_space];
3834 ph10 579 classbits[1] |= cbits[cbit_space+1] & ~0x08;
3835 ph10 552 for (c = 2; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
3836 nigel 77 continue;
3837    
3838     case ESC_S:
3839 ph10 286 should_flip_negation = TRUE;
3840 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
3841     classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
3842     continue;