/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 754 - (hide annotations) (download)
Sat Nov 19 18:32:18 2011 UTC (2 years, 9 months ago) by ph10
File MIME type: text/plain
File size: 250841 byte(s)
Support \C in lookbehinds and DFA matching when not in UTF-8 mode.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 598 Copyright (c) 1997-2011 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains the external function pcre_compile(), along with
42     supporting internal functions that are not used by other modules. */
43    
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 236 #include "config.h"
47 ph10 200 #endif
48 ph10 199
49 nigel 93 #define NLBLOCK cd /* Block containing newline information */
50     #define PSSTART start_pattern /* Field containing processed string start */
51     #define PSEND end_pattern /* Field containing processed string end */
52    
53 nigel 77 #include "pcre_internal.h"
54    
55    
56 ph10 475 /* When PCRE_DEBUG is defined, we need the pcre_printint() function, which is
57     also used by pcretest. PCRE_DEBUG is not defined when building a production
58     library. */
59 nigel 85
60 ph10 475 #ifdef PCRE_DEBUG
61 nigel 85 #include "pcre_printint.src"
62     #endif
63    
64    
/* Macro for setting individual bits in class bitmaps. "a" is the bitmap (an
array of bytes) and "b" is the number of the bit to set. Both arguments and the
whole expansion are parenthesized so that expression arguments such as "c + 1"
select the intended byte and bit, and so that the macro can safely be used
inside a larger expression. Note that "b" is evaluated twice, so it must not
have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
68    
69 ph10 202 /* Maximum length value to check against when making sure that the integer that
70     holds the compiled pattern length does not overflow. We make it a bit less than
71     INT_MAX to allow for adding in group terminating bytes, so that we don't have
72     to check them every time. */
73 ph10 178
74 ph10 202 #define OFLOW_MAX (INT_MAX - 20)
75    
76    
77 nigel 77 /*************************************************
78     * Code parameters and static tables *
79     *************************************************/
80    
81 nigel 93 /* This value specifies the size of stack workspace that is used during the
82     first pre-compile phase that determines how much memory is required. The regex
83     is partly compiled into this space, but the compiled parts are discarded as
84     soon as they can be, so that hopefully there will never be an overrun. The code
85     does, however, check for an overrun. The largest amount I've seen used is 218,
86     so this number is very generous.
87 nigel 77
88 nigel 93 The same workspace is used during the second, actual compile phase for
89     remembering forward references to groups so that they can be filled in at the
90     end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
91     is 4 there is plenty of room. */
92 nigel 77
93 nigel 93 #define COMPILE_WORK_SIZE (4096)
94 nigel 77
95 ph10 507 /* The overrun tests check for a slightly smaller size so that they detect the
96 ph10 505 overrun before it actually does run off the end of the data block. */
97 nigel 93
98 ph10 505 #define WORK_SIZE_CHECK (COMPILE_WORK_SIZE - 100)
99    
100    
101 nigel 77 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
102     are simple data values; negative values are for special things like \d and so
103     on. Zero means further processing is needed (for things like \x), or the escape
104     is invalid. */
105    
106 ph10 391 #ifndef EBCDIC
107    
108     /* This is the "normal" table for ASCII systems or for EBCDIC systems running
109 ph10 392 in UTF-8 mode. */
110 ph10 391
111 ph10 392 static const short int escapes[] = {
112 ph10 391 0, 0,
113     0, 0,
114 ph10 392 0, 0,
115     0, 0,
116     0, 0,
117 ph10 391 CHAR_COLON, CHAR_SEMICOLON,
118 ph10 392 CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN,
119 ph10 391 CHAR_GREATER_THAN_SIGN, CHAR_QUESTION_MARK,
120 ph10 392 CHAR_COMMERCIAL_AT, -ESC_A,
121     -ESC_B, -ESC_C,
122     -ESC_D, -ESC_E,
123     0, -ESC_G,
124     -ESC_H, 0,
125     0, -ESC_K,
126 ph10 391 0, 0,
127 ph10 514 -ESC_N, 0,
128 ph10 391 -ESC_P, -ESC_Q,
129     -ESC_R, -ESC_S,
130 ph10 392 0, 0,
131     -ESC_V, -ESC_W,
132     -ESC_X, 0,
133     -ESC_Z, CHAR_LEFT_SQUARE_BRACKET,
134 ph10 391 CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET,
135 ph10 392 CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE,
136 ph10 391 CHAR_GRAVE_ACCENT, 7,
137 ph10 392 -ESC_b, 0,
138     -ESC_d, ESC_e,
139 ph10 391 ESC_f, 0,
140     -ESC_h, 0,
141 ph10 392 0, -ESC_k,
142 ph10 391 0, 0,
143     ESC_n, 0,
144 ph10 392 -ESC_p, 0,
145     ESC_r, -ESC_s,
146 ph10 391 ESC_tee, 0,
147 ph10 392 -ESC_v, -ESC_w,
148     0, 0,
149 ph10 391 -ESC_z
150 nigel 77 };
151    
152 ph10 392 #else
153 ph10 391
154     /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
155    
156 nigel 77 static const short int escapes[] = {
157     /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
158     /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
159     /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
160     /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
161     /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
162     /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
163     /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
164     /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
165 ph10 178 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
166 nigel 93 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
167 nigel 77 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
168 ph10 178 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
169 nigel 77 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
170     /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
171     /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
172     /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
173 ph10 178 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
174 ph10 514 /* D0 */ '}', 0, -ESC_K, 0, 0,-ESC_N, 0, -ESC_P,
175 nigel 93 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
176 ph10 178 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
177 nigel 77 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
178     /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
179     /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
180     };
181     #endif
182    
183    
184 ph10 243 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
185     searched linearly. Put all the names into a single string, in order to reduce
186 ph10 392 the number of relocations when a shared library is dynamically linked. The
187     string is built from string macros so that it works in UTF-8 mode on EBCDIC
188 ph10 391 platforms. */
189 ph10 210
190     typedef struct verbitem {
191 ph10 510 int len; /* Length of verb name */
192     int op; /* Op when no arg, or -1 if arg mandatory */
193     int op_arg; /* Op when arg present, or -1 if not allowed */
194 ph10 211 } verbitem;
195 ph10 210
196 ph10 240 static const char verbnames[] =
197 ph10 510 "\0" /* Empty name is a shorthand for MARK */
198 ph10 512 STRING_MARK0
199 ph10 391 STRING_ACCEPT0
200     STRING_COMMIT0
201     STRING_F0
202     STRING_FAIL0
203     STRING_PRUNE0
204     STRING_SKIP0
205     STRING_THEN;
206 ph10 240
207 ph10 327 static const verbitem verbs[] = {
208 ph10 510 { 0, -1, OP_MARK },
209 ph10 512 { 4, -1, OP_MARK },
210 ph10 510 { 6, OP_ACCEPT, -1 },
211     { 6, OP_COMMIT, -1 },
212     { 1, OP_FAIL, -1 },
213     { 4, OP_FAIL, -1 },
214     { 5, OP_PRUNE, OP_PRUNE_ARG },
215     { 4, OP_SKIP, OP_SKIP_ARG },
216     { 4, OP_THEN, OP_THEN_ARG }
217 ph10 210 };
218    
219 ph10 327 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
220 ph10 210
221    
222 ph10 243 /* Tables of names of POSIX character classes and their lengths. The names are
223     now all in a single string, to reduce the number of relocations when a shared
224 ph10 240 library is dynamically loaded. The list of lengths is terminated by a zero
225     length entry. The first three must be alpha, lower, upper, as this is assumed
226     for handling case independence. */
227 nigel 77
228 ph10 240 static const char posix_names[] =
229 ph10 392 STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
230     STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
231 ph10 391 STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
232     STRING_word0 STRING_xdigit;
233 nigel 77
234     static const uschar posix_name_lengths[] = {
235     5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
236    
237 nigel 87 /* Table of class bit maps for each POSIX class. Each class is formed from a
238     base map, with an optional addition or removal of another map. Then, for some
239     classes, there is some additional tweaking: for [:blank:] the vertical space
240     characters are removed, and for [:alpha:] and [:alnum:] the underscore
241     character is removed. The triples in the table consist of the base map offset,
242     second map offset or -1 if no second map, and a non-negative value for map
243     addition or a negative value for map subtraction (if there are two maps). The
244     absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
245     remove vertical space characters, 2 => remove underscore. */
246 nigel 77
247     static const int posix_class_maps[] = {
248 nigel 87 cbit_word, cbit_digit, -2, /* alpha */
249     cbit_lower, -1, 0, /* lower */
250     cbit_upper, -1, 0, /* upper */
251     cbit_word, -1, 2, /* alnum - word without underscore */
252     cbit_print, cbit_cntrl, 0, /* ascii */
253     cbit_space, -1, 1, /* blank - a GNU extension */
254     cbit_cntrl, -1, 0, /* cntrl */
255     cbit_digit, -1, 0, /* digit */
256     cbit_graph, -1, 0, /* graph */
257     cbit_print, -1, 0, /* print */
258     cbit_punct, -1, 0, /* punct */
259     cbit_space, -1, 0, /* space */
260     cbit_word, -1, 0, /* word - a Perl extension */
261     cbit_xdigit,-1, 0 /* xdigit */
262 nigel 77 };
263    
264 ph10 535 /* Table of substitutes for \d etc when PCRE_UCP is set. The POSIX class
265     substitutes must be in the order of the names, defined above, and there are
266 ph10 518 both positive and negative cases. NULL means no substitute. */
267 nigel 77
268 ph10 518 #ifdef SUPPORT_UCP
269     static const uschar *substitutes[] = {
270     (uschar *)"\\P{Nd}", /* \D */
271     (uschar *)"\\p{Nd}", /* \d */
272     (uschar *)"\\P{Xsp}", /* \S */ /* NOTE: Xsp is Perl space */
273     (uschar *)"\\p{Xsp}", /* \s */
274     (uschar *)"\\P{Xwd}", /* \W */
275 ph10 535 (uschar *)"\\p{Xwd}" /* \w */
276 ph10 518 };
277 ph10 535
278 ph10 518 static const uschar *posix_substitutes[] = {
279     (uschar *)"\\p{L}", /* alpha */
280 ph10 535 (uschar *)"\\p{Ll}", /* lower */
281     (uschar *)"\\p{Lu}", /* upper */
282     (uschar *)"\\p{Xan}", /* alnum */
283 ph10 518 NULL, /* ascii */
284     (uschar *)"\\h", /* blank */
285     NULL, /* cntrl */
286     (uschar *)"\\p{Nd}", /* digit */
287     NULL, /* graph */
288     NULL, /* print */
289     NULL, /* punct */
290     (uschar *)"\\p{Xps}", /* space */ /* NOTE: Xps is POSIX space */
291     (uschar *)"\\p{Xwd}", /* word */
292 ph10 535 NULL, /* xdigit */
293 ph10 518 /* Negated cases */
294     (uschar *)"\\P{L}", /* ^alpha */
295 ph10 535 (uschar *)"\\P{Ll}", /* ^lower */
296     (uschar *)"\\P{Lu}", /* ^upper */
297     (uschar *)"\\P{Xan}", /* ^alnum */
298 ph10 518 NULL, /* ^ascii */
299     (uschar *)"\\H", /* ^blank */
300     NULL, /* ^cntrl */
301     (uschar *)"\\P{Nd}", /* ^digit */
302     NULL, /* ^graph */
303     NULL, /* ^print */
304     NULL, /* ^punct */
305     (uschar *)"\\P{Xps}", /* ^space */ /* NOTE: Xps is POSIX space */
306     (uschar *)"\\P{Xwd}", /* ^word */
307 ph10 535 NULL /* ^xdigit */
308 ph10 518 };
309     #define POSIX_SUBSIZE (sizeof(posix_substitutes)/sizeof(uschar *))
310 ph10 535 #endif
311 ph10 518
312 nigel 93 #define STRING(a) # a
313     #define XSTRING(s) STRING(s)
314    
315 nigel 77 /* The texts of compile-time error messages. These are "char *" because they
316 nigel 93 are passed to the outside world. Do not ever re-use any error number, because
317     they are documented. Always add a new error instead. Messages marked DEAD below
318 ph10 243 are no longer used. This used to be a table of strings, but in order to reduce
319     the number of relocations needed when a shared library is loaded dynamically,
320     it is now one long string. We cannot use a table of offsets, because the
321     lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
322     simply count through to the one we want - this isn't a performance issue
323 ph10 507 because these strings are used only when there is a compilation error.
324 nigel 77
325 ph10 507 Each substring ends with \0 to insert a null character. This includes the final
326     substring, so that the whole string ends with \0\0, which can be detected when
327 ph10 499 counting through. */
328    
329 ph10 240 static const char error_texts[] =
330     "no error\0"
331     "\\ at end of pattern\0"
332     "\\c at end of pattern\0"
333     "unrecognized character follows \\\0"
334     "numbers out of order in {} quantifier\0"
335 nigel 77 /* 5 */
336 ph10 240 "number too big in {} quantifier\0"
337     "missing terminating ] for character class\0"
338     "invalid escape sequence in character class\0"
339     "range out of order in character class\0"
340     "nothing to repeat\0"
341 nigel 77 /* 10 */
342 ph10 240 "operand of unlimited repeat could match the empty string\0" /** DEAD **/
343     "internal error: unexpected repeat\0"
344 ph10 269 "unrecognized character after (? or (?-\0"
345 ph10 240 "POSIX named classes are supported only within a class\0"
346     "missing )\0"
347 nigel 77 /* 15 */
348 ph10 240 "reference to non-existent subpattern\0"
349     "erroffset passed as NULL\0"
350     "unknown option bit(s) set\0"
351     "missing ) after comment\0"
352     "parentheses nested too deeply\0" /** DEAD **/
353 nigel 77 /* 20 */
354 ph10 240 "regular expression is too large\0"
355     "failed to get memory\0"
356     "unmatched parentheses\0"
357     "internal error: code overflow\0"
358     "unrecognized character after (?<\0"
359 nigel 77 /* 25 */
360 ph10 240 "lookbehind assertion is not fixed length\0"
361     "malformed number or name after (?(\0"
362     "conditional group contains more than two branches\0"
363     "assertion expected after (?(\0"
364     "(?R or (?[+-]digits must be followed by )\0"
365 nigel 77 /* 30 */
366 ph10 240 "unknown POSIX class name\0"
367     "POSIX collating elements are not supported\0"
368     "this version of PCRE is not compiled with PCRE_UTF8 support\0"
369     "spare error\0" /** DEAD **/
370     "character value in \\x{...} sequence is too large\0"
371 nigel 77 /* 35 */
372 ph10 240 "invalid condition (?(0)\0"
373     "\\C not allowed in lookbehind assertion\0"
374 ph10 514 "PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0"
375 ph10 240 "number after (?C is > 255\0"
376     "closing ) for (?C expected\0"
377 nigel 77 /* 40 */
378 ph10 240 "recursive call could loop indefinitely\0"
379     "unrecognized character after (?P\0"
380     "syntax error in subpattern name (missing terminator)\0"
381     "two named subpatterns have the same name\0"
382     "invalid UTF-8 string\0"
383 nigel 77 /* 45 */
384 ph10 240 "support for \\P, \\p, and \\X has not been compiled\0"
385     "malformed \\P or \\p sequence\0"
386     "unknown property name after \\P or \\p\0"
387     "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
388     "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
389 nigel 91 /* 50 */
390 ph10 240 "repeated subpattern is too long\0" /** DEAD **/
391     "octal value is greater than \\377 (not in UTF-8 mode)\0"
392     "internal error: overran compiling workspace\0"
393     "internal error: previously-checked referenced subpattern not found\0"
394     "DEFINE group contains more than one branch\0"
395 nigel 93 /* 55 */
396 ph10 637 "repeating a DEFINE group is not allowed\0" /** DEAD **/
397 ph10 240 "inconsistent NEWLINE options\0"
398 ph10 333 "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
399     "a numbered reference must not be zero\0"
400 ph10 510 "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0"
401 ph10 211 /* 60 */
402 ph10 240 "(*VERB) not recognized\0"
403 ph10 268 "number is too big\0"
404 ph10 272 "subpattern name expected\0"
405 ph10 336 "digit expected after (?+\0"
406 ph10 457 "] is an invalid data character in JavaScript compatibility mode\0"
407     /* 65 */
408 ph10 510 "different names for subpatterns of the same number are not allowed\0"
409 ph10 512 "(*MARK) must have an argument\0"
410 ph10 535 "this version of PCRE is not compiled with PCRE_UCP support\0"
411 ph10 579 "\\c must be followed by an ASCII character\0"
412 ph10 654 "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
413 ph10 747 /* 70 */
414     "internal error: unknown opcode in find_fixedlength()\0"
415 ph10 510 ;
416 nigel 77
417     /* Table to identify digits and hex digits. This is used when compiling
418     patterns. Note that the tables in chartables are dependent on the locale, and
419     may mark arbitrary characters as digits - but the PCRE compiling code expects
420     to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
421     a private table here. It costs 256 bytes, but it is a lot faster than doing
422     character value tests (at least in some simple cases I timed), and in some
423     applications one wants PCRE to compile efficiently as well as match
424     efficiently.
425    
426     For convenience, we use the same bit definitions as in chartables:
427    
428     0x04 decimal digit
429     0x08 hexadecimal digit
430    
431     Then we can use ctype_digit and ctype_xdigit in the code. */
432    
433 ph10 392 #ifndef EBCDIC
434 ph10 391
435 ph10 392 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
436 ph10 391 UTF-8 mode. */
437    
438 nigel 77 static const unsigned char digitab[] =
439     {
440     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
441     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
442     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
443     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
444     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
445     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
446     0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
447     0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
448     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
449     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
450     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
451     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
452     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
453     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
454     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
455     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
456     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
457     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
458     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
459     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
460     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
461     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
462     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
463     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
464     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
465     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
466     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
467     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
468     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
469     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
470     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
471     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
472    
473 ph10 392 #else
474 ph10 391
475     /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
476    
477 nigel 77 static const unsigned char digitab[] =
478     {
479     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
480     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
481     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
482     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
483     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
484     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
485     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
486     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
487     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
488     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
489     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
490 ph10 97 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
491 nigel 77 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
492     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
493     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
494     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
495     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
496     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
497     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
498     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
499     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
500     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
501     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
502     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
503     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
504     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
505     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
506     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
507     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
508     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
509     0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
510     0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
511    
512     static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
513     0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
514     0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
515     0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
516     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
517     0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
518     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
519     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
520     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
521     0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
522     0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
523     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
524 ph10 97 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
525 nigel 77 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
526     0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
527     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
528     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
529     0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
530     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
531     0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
532     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
533     0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
534     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
535     0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
536     0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
537     0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
538     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
539     0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
540     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
541     0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
542     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
543     0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
544     0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
545     #endif
546    
547    
548     /* Definition to allow mutual recursion */
549    
550     static BOOL
551 ph10 642 compile_regex(int, uschar **, const uschar **, int *, BOOL, BOOL, int, int,
552     int *, int *, branch_chain *, compile_data *, int *);
553 nigel 77
554    
555    
556     /*************************************************
557 ph10 240 * Find an error text *
558     *************************************************/
559    
560 ph10 243 /* The error texts are now all in one long string, to save on relocations. As
561     some of the text is of unknown length, we can't use a table of offsets.
562     Instead, just count through the strings. This is not a performance issue
563 ph10 240 because it happens only when there has been a compilation error.
564    
565     Argument: the error number
566     Returns: pointer to the error string
567     */
568    
569     static const char *
570     find_error_text(int n)
571     {
572     const char *s = error_texts;
573 ph10 507 for (; n > 0; n--)
574 ph10 499 {
575     while (*s++ != 0) {};
576     if (*s == 0) return "Error text not found (please report)";
577 ph10 507 }
578 ph10 240 return s;
579     }
580    
581    
582     /*************************************************
583 ph10 640 * Check for counted repeat *
584     *************************************************/
585    
586     /* This function is called when a '{' is encountered in a place where it might
587     start a quantifier. It looks ahead to see if it really is a quantifier or not.
588     It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
589     where the ddds are digits.
590    
591     Arguments:
592     p pointer to the first char after '{'
593    
594     Returns: TRUE or FALSE
595     */
596    
597     static BOOL
598     is_counted_repeat(const uschar *p)
599     {
600     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
601     while ((digitab[*p] & ctype_digit) != 0) p++;
602     if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
603    
604     if (*p++ != CHAR_COMMA) return FALSE;
605     if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
606    
607     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
608     while ((digitab[*p] & ctype_digit) != 0) p++;
609    
610     return (*p == CHAR_RIGHT_CURLY_BRACKET);
611     }
612    
613    
614    
615     /*************************************************
616 nigel 77 * Handle escapes *
617     *************************************************/
618    
619     /* This function is called when a \ has been encountered. It either returns a
620     positive value for a simple escape such as \n, or a negative value which
621 nigel 93 encodes one of the more complicated things such as \d. A backreference to group
622     n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
623     UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
624     ptr is pointing at the \. On exit, it is on the final character of the escape
625     sequence.
626 nigel 77
627     Arguments:
628     ptrptr points to the pattern position pointer
629     errorcodeptr points to the errorcode variable
630     bracount number of previous extracting brackets
631     options the options bits
632     isclass TRUE if inside a character class
633    
634     Returns: zero or positive => a data character
635     negative => a special escape sequence
636 ph10 213 on error, errorcodeptr is set
637 nigel 77 */
638    
639     static int
640     check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
641     int options, BOOL isclass)
642     {
643 nigel 87 BOOL utf8 = (options & PCRE_UTF8) != 0;
644     const uschar *ptr = *ptrptr + 1;
645 nigel 77 int c, i;
646    
647 nigel 87 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
648     ptr--; /* Set pointer back to the last byte */
649    
650 nigel 77 /* If backslash is at the end of the pattern, it's an error. */
651    
652     if (c == 0) *errorcodeptr = ERR1;
653    
654 ph10 274 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
655     in a table. A non-zero result is something that can be returned immediately.
656 nigel 77 Otherwise further processing may be required. */
657    
658 ph10 391 #ifndef EBCDIC /* ASCII/UTF-8 coding */
659     else if (c < CHAR_0 || c > CHAR_z) {} /* Not alphanumeric */
660     else if ((i = escapes[c - CHAR_0]) != 0) c = i;
661 nigel 77
662 ph10 97 #else /* EBCDIC coding */
663 ph10 274 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphanumeric */
664 nigel 77 else if ((i = escapes[c - 0x48]) != 0) c = i;
665     #endif
666    
667     /* Escapes that need further processing, or are illegal. */
668    
669     else
670     {
671     const uschar *oldptr;
672 nigel 93 BOOL braced, negated;
673    
674 nigel 77 switch (c)
675     {
676     /* A number of Perl escapes are not handled by PCRE. We give an explicit
677     error. */
678    
679 ph10 391 case CHAR_l:
680     case CHAR_L:
681 zherczeg 744 *errorcodeptr = ERR37;
682     break;
683    
684 ph10 391 case CHAR_u:
685 zherczeg 744 if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
686     {
687     /* In JavaScript, \u must be followed by four hexadecimal numbers.
688     Otherwise it is a lowercase u letter. */
689     if ((digitab[ptr[1]] & ctype_xdigit) != 0 && (digitab[ptr[2]] & ctype_xdigit) != 0
690     && (digitab[ptr[3]] & ctype_xdigit) != 0 && (digitab[ptr[4]] & ctype_xdigit) != 0)
691     {
692     c = 0;
693     for (i = 0; i < 4; ++i)
694     {
695     register int cc = *(++ptr);
696     #ifndef EBCDIC /* ASCII/UTF-8 coding */
697     if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
698     c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
699     #else /* EBCDIC coding */
700     if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
701     c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
702     #endif
703     }
704     }
705     }
706     else
707     *errorcodeptr = ERR37;
708     break;
709    
710 ph10 391 case CHAR_U:
711 zherczeg 744 /* In JavaScript, \U is an uppercase U letter. */
712     if ((options & PCRE_JAVASCRIPT_COMPAT) == 0) *errorcodeptr = ERR37;
713 nigel 77 break;
714    
715 ph10 654 /* In a character class, \g is just a literal "g". Outside a character
716 ph10 640 class, \g must be followed by one of a number of specific things:
717 ph10 345
718 ph10 333 (1) A number, either plain or braced. If positive, it is an absolute
719     backreference. If negative, it is a relative backreference. This is a Perl
720     5.10 feature.
721 ph10 345
722 ph10 333 (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
723     is part of Perl's movement towards a unified syntax for back references. As
724     this is synonymous with \k{name}, we fudge it up by pretending it really
725     was \k.
726 ph10 345
727     (3) For Oniguruma compatibility we also support \g followed by a name or a
728     number either in angle brackets or in single quotes. However, these are
729     (possibly recursive) subroutine calls, _not_ backreferences. Just return
730 ph10 333 the -ESC_g code (cf \k). */
731 nigel 93
732 ph10 391 case CHAR_g:
733 ph10 640 if (isclass) break;
734 ph10 391 if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
735 ph10 333 {
736     c = -ESC_g;
737 ph10 345 break;
738     }
739 ph10 333
740     /* Handle the Perl-compatible cases */
741 ph10 345
742 ph10 391 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
743 nigel 93 {
744 ph10 171 const uschar *p;
745 ph10 391 for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
746     if (*p != CHAR_MINUS && (digitab[*p] & ctype_digit) == 0) break;
747     if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)
748 ph10 171 {
749     c = -ESC_k;
750     break;
751 ph10 172 }
752 nigel 93 braced = TRUE;
753     ptr++;
754     }
755     else braced = FALSE;
756    
757 ph10 391 if (ptr[1] == CHAR_MINUS)
758 nigel 93 {
759     negated = TRUE;
760     ptr++;
761     }
762     else negated = FALSE;
763    
764     c = 0;
765     while ((digitab[ptr[1]] & ctype_digit) != 0)
766 ph10 391 c = c * 10 + *(++ptr) - CHAR_0;
767 ph10 220
768 ph10 333 if (c < 0) /* Integer overflow */
769 ph10 213 {
770     *errorcodeptr = ERR61;
771     break;
772 ph10 220 }
773 ph10 345
774 ph10 391 if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
775 nigel 93 {
776     *errorcodeptr = ERR57;
777 ph10 213 break;
778 nigel 93 }
779 ph10 345
780 ph10 333 if (c == 0)
781     {
782     *errorcodeptr = ERR58;
783     break;
784 ph10 345 }
785 nigel 93
786     if (negated)
787     {
788     if (c > bracount)
789     {
790     *errorcodeptr = ERR15;
791 ph10 213 break;
792 nigel 93 }
793     c = bracount - (c - 1);
794     }
795    
796     c = -(ESC_REF + c);
797     break;
798    
799 nigel 77 /* The handling of escape sequences consisting of a string of digits
800     starting with one that is not zero is not straightforward. By experiment,
801     the way Perl works seems to be as follows:
802    
803     Outside a character class, the digits are read as a decimal number. If the
804     number is less than 10, or if there are that many previous extracting
805     left brackets, then it is a back reference. Otherwise, up to three octal
806     digits are read to form an escaped byte. Thus \123 is likely to be octal
807     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
808     value is greater than 377, the least significant 8 bits are taken. Inside a
809     character class, \ followed by a digit is always an octal number. */
810    
811 ph10 391 case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
812     case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
813 nigel 77
814     if (!isclass)
815     {
816     oldptr = ptr;
817 ph10 391 c -= CHAR_0;
818 nigel 77 while ((digitab[ptr[1]] & ctype_digit) != 0)
819 ph10 391 c = c * 10 + *(++ptr) - CHAR_0;
820 ph10 333 if (c < 0) /* Integer overflow */
821 ph10 213 {
822     *errorcodeptr = ERR61;
823 ph10 220 break;
824     }
825 nigel 77 if (c < 10 || c <= bracount)
826     {
827     c = -(ESC_REF + c);
828     break;
829     }
830     ptr = oldptr; /* Put the pointer back and fall through */
831     }
832    
833     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
834     generates a binary zero byte and treats the digit as a following literal.
835     Thus we have to pull back the pointer by one. */
836    
837 ph10 391 if ((c = *ptr) >= CHAR_8)
838 nigel 77 {
839     ptr--;
840     c = 0;
841     break;
842     }
843    
844     /* \0 always starts an octal number, but we may drop through to here with a
845 nigel 91 larger first octal digit. The original code used just to take the least
846     significant 8 bits of octal numbers (I think this is what early Perls used
847     to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
848     than 3 octal digits. */
849 nigel 77
850 ph10 391 case CHAR_0:
851     c -= CHAR_0;
852     while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
853     c = c * 8 + *(++ptr) - CHAR_0;
854 nigel 91 if (!utf8 && c > 255) *errorcodeptr = ERR51;
855 nigel 77 break;
856    
857 nigel 87 /* \x is complicated. \x{ddd} is a character number which can be greater
858     than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
859     treated as a data character. */
860 nigel 77
861 ph10 391 case CHAR_x:
862 zherczeg 744 if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
863     {
864     /* In JavaScript, \x must be followed by two hexadecimal numbers.
865     Otherwise it is a lowercase x letter. */
866     if ((digitab[ptr[1]] & ctype_xdigit) != 0 && (digitab[ptr[2]] & ctype_xdigit) != 0)
867     {
868     c = 0;
869     for (i = 0; i < 2; ++i)
870     {
871     register int cc = *(++ptr);
872     #ifndef EBCDIC /* ASCII/UTF-8 coding */
873     if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
874     c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
875     #else /* EBCDIC coding */
876     if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
877     c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
878     #endif
879     }
880     }
881     break;
882     }
883    
884 ph10 391 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
885 nigel 77 {
886     const uschar *pt = ptr + 2;
887 nigel 87 int count = 0;
888    
889 nigel 77 c = 0;
890     while ((digitab[*pt] & ctype_xdigit) != 0)
891     {
892 nigel 87 register int cc = *pt++;
893 ph10 391 if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
894 nigel 77 count++;
895 nigel 87
896 ph10 391 #ifndef EBCDIC /* ASCII/UTF-8 coding */
897     if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
898     c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
899 ph10 97 #else /* EBCDIC coding */
900 ph10 391 if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
901     c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
902 nigel 77 #endif
903     }
904 nigel 87
905 ph10 391 if (*pt == CHAR_RIGHT_CURLY_BRACKET)
906 nigel 77 {
907 nigel 87 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
908 nigel 77 ptr = pt;
909     break;
910     }
911 nigel 87
912 nigel 77 /* If the sequence of hex digits does not end with '}', then we don't
913     recognize this construct; fall through to the normal \x handling. */
914     }
915    
916 nigel 87 /* Read just a single-byte hex-defined char */
917 nigel 77
918     c = 0;
919     while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
920     {
921 ph10 391 int cc; /* Some compilers don't like */
922     cc = *(++ptr); /* ++ in initializers */
923     #ifndef EBCDIC /* ASCII/UTF-8 coding */
924     if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
925     c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
926 ph10 97 #else /* EBCDIC coding */
927 ph10 391 if (cc <= CHAR_z) cc += 64; /* Convert to upper case */
928     c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
929 nigel 77 #endif
930     }
931     break;
932    
933 nigel 93 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
934 ph10 574 An error is given if the byte following \c is not an ASCII character. This
935     coding is ASCII-specific, but then the whole concept of \cx is
936 nigel 93 ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
937 nigel 77
938 ph10 391 case CHAR_c:
939 nigel 77 c = *(++ptr);
940     if (c == 0)
941     {
942     *errorcodeptr = ERR2;
943 ph10 213 break;
944 nigel 77 }
945 ph10 574 #ifndef EBCDIC /* ASCII/UTF-8 coding */
946     if (c > 127) /* Excludes all non-ASCII in either mode */
947     {
948     *errorcodeptr = ERR68;
949 ph10 579 break;
950     }
951 ph10 391 if (c >= CHAR_a && c <= CHAR_z) c -= 32;
952 nigel 77 c ^= 0x40;
953 ph10 574 #else /* EBCDIC coding */
954 ph10 391 if (c >= CHAR_a && c <= CHAR_z) c += 64;
955 nigel 77 c ^= 0xC0;
956     #endif
957     break;
958    
959     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
960 ph10 274 other alphanumeric following \ is an error if PCRE_EXTRA was set;
961     otherwise, for Perl compatibility, it is a literal. This code looks a bit
962     odd, but there used to be some cases other than the default, and there may
963     be again in future, so I haven't "optimized" it. */
964 nigel 77
965     default:
966     if ((options & PCRE_EXTRA) != 0) switch(c)
967     {
968     default:
969     *errorcodeptr = ERR3;
970     break;
971     }
972     break;
973     }
974     }
975 ph10 518
976     /* Perl supports \N{name} for character names, as well as plain \N for "not
977 ph10 654 newline". PCRE does not support \N{name}. However, it does support
978 ph10 640 quantification such as \N{2,3}. */
979 nigel 77
980 ph10 640 if (c == -ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET &&
981     !is_counted_repeat(ptr+2))
982 ph10 518 *errorcodeptr = ERR37;
983 ph10 514
984 ph10 518 /* If PCRE_UCP is set, we change the values for \d etc. */
985    
986     if ((options & PCRE_UCP) != 0 && c <= -ESC_D && c >= -ESC_w)
987     c -= (ESC_DU - ESC_D);
988    
989     /* Set the pointer to the final character before returning. */
990    
991 nigel 77 *ptrptr = ptr;
992     return c;
993     }
994    
995    
996    
997     #ifdef SUPPORT_UCP
998     /*************************************************
999     * Handle \P and \p *
1000     *************************************************/
1001    
1002     /* This function is called after \P or \p has been encountered, provided that
1003     PCRE is compiled with support for Unicode properties. On entry, ptrptr is
1004     pointing at the P or p. On exit, it is pointing at the final character of the
1005     escape sequence.
1006    
1007     Argument:
1008     ptrptr points to the pattern position pointer
1009     negptr points to a boolean that is set TRUE for negation else FALSE
1010 nigel 87 dptr points to an int that is set to the detailed property value
1011 nigel 77 errorcodeptr points to the error code variable
1012    
1013 nigel 87 Returns: type value from ucp_type_table, or -1 for an invalid type
1014 nigel 77 */
1015    
1016     static int
1017 nigel 87 get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
1018 nigel 77 {
1019     int c, i, bot, top;
1020     const uschar *ptr = *ptrptr;
1021 nigel 87 char name[32];
1022 nigel 77
1023     c = *(++ptr);
1024     if (c == 0) goto ERROR_RETURN;
1025    
1026     *negptr = FALSE;
1027    
1028 nigel 87 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
1029     negation. */
1030 nigel 77
1031 ph10 391 if (c == CHAR_LEFT_CURLY_BRACKET)
1032 nigel 77 {
1033 ph10 391 if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1034 nigel 77 {
1035     *negptr = TRUE;
1036     ptr++;
1037     }
1038 ph10 199 for (i = 0; i < (int)sizeof(name) - 1; i++)
1039 nigel 77 {
1040     c = *(++ptr);
1041     if (c == 0) goto ERROR_RETURN;
1042 ph10 391 if (c == CHAR_RIGHT_CURLY_BRACKET) break;
1043 nigel 77 name[i] = c;
1044     }
1045 ph10 391 if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
1046 nigel 77 name[i] = 0;
1047     }
1048    
1049     /* Otherwise there is just one following character */
1050    
1051     else
1052     {
1053     name[0] = c;
1054     name[1] = 0;
1055     }
1056    
1057     *ptrptr = ptr;
1058    
1059     /* Search for a recognized property name using binary chop */
1060    
1061     bot = 0;
1062     top = _pcre_utt_size;
1063    
1064     while (bot < top)
1065     {
1066 nigel 87 i = (bot + top) >> 1;
1067 ph10 240 c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);
1068 nigel 87 if (c == 0)
1069     {
1070     *dptr = _pcre_utt[i].value;
1071     return _pcre_utt[i].type;
1072     }
1073 nigel 77 if (c > 0) bot = i + 1; else top = i;
1074     }
1075    
1076     *errorcodeptr = ERR47;
1077     *ptrptr = ptr;
1078     return -1;
1079    
1080     ERROR_RETURN:
1081     *errorcodeptr = ERR46;
1082     *ptrptr = ptr;
1083     return -1;
1084     }
1085     #endif
1086    
1087    
1088    
1089    
1090     /*************************************************
1091     * Read repeat counts *
1092     *************************************************/
1093    
1094     /* Read an item of the form {n,m} and return the values. This is called only
1095     after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1096     so the syntax is guaranteed to be correct, but we need to check the values.
1097    
1098     Arguments:
1099     p pointer to first char after '{'
1100     minp pointer to int for min
1101     maxp pointer to int for max
1102     returned as -1 if no max
1103     errorcodeptr points to error code variable
1104    
1105     Returns: pointer to '}' on success;
1106     current ptr on error, with errorcodeptr set non-zero
1107     */
1108    
1109     static const uschar *
1110     read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
1111     {
1112     int min = 0;
1113     int max = -1;
1114    
1115 nigel 81 /* Read the minimum value and do a paranoid check: a negative value indicates
1116     an integer overflow. */
1117    
1118 ph10 391 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - CHAR_0;
1119 nigel 81 if (min < 0 || min > 65535)
1120     {
1121     *errorcodeptr = ERR5;
1122     return p;
1123     }
1124 nigel 77
1125 nigel 81 /* Read the maximum value if there is one, and again do a paranoid on its size.
1126     Also, max must not be less than min. */
1127    
1128 ph10 391 if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
1129 nigel 77 {
1130 ph10 391 if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
1131 nigel 77 {
1132     max = 0;
1133 ph10 391 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - CHAR_0;
1134 nigel 81 if (max < 0 || max > 65535)
1135     {
1136     *errorcodeptr = ERR5;
1137     return p;
1138     }
1139 nigel 77 if (max < min)
1140     {
1141     *errorcodeptr = ERR4;
1142     return p;
1143     }
1144     }
1145     }
1146    
1147 nigel 81 /* Fill in the required variables, and pass back the pointer to the terminating
1148     '}'. */
1149 nigel 77
1150 nigel 81 *minp = min;
1151     *maxp = max;
1152 nigel 77 return p;
1153     }
1154    
1155    
1156    
1157     /*************************************************
1158 ph10 408 * Subroutine for finding forward reference *
1159 nigel 91 *************************************************/
1160    
1161 ph10 408 /* This recursive function is called only from find_parens() below. The
1162     top-level call starts at the beginning of the pattern. All other calls must
1163     start at a parenthesis. It scans along a pattern's text looking for capturing
1164 nigel 93 subpatterns, and counting them. If it finds a named pattern that matches the
1165     name it is given, it returns its number. Alternatively, if the name is NULL, it
1166 ph10 578 returns when it reaches a given numbered subpattern. Recursion is used to keep
1167     track of subpatterns that reset the capturing group numbers - the (?| feature.
1168 nigel 91
1169 ph10 578 This function was originally called only from the second pass, in which we know
1170     that if (?< or (?' or (?P< is encountered, the name will be correctly
1171     terminated because that is checked in the first pass. There is now one call to
1172     this function in the first pass, to check for a recursive back reference by
1173     name (so that we can make the whole group atomic). In this case, we need check
1174 ph10 579 only up to the current position in the pattern, and that is still OK because
1175     and previous occurrences will have been checked. To make this work, the test
1176     for "end of pattern" is a check against cd->end_pattern in the main loop,
1177 ph10 578 instead of looking for a binary zero. This means that the special first-pass
1178 ph10 579 call can adjust cd->end_pattern temporarily. (Checks for binary zero while
1179     processing items within the loop are OK, because afterwards the main loop will
1180 ph10 578 terminate.)
1181    
1182 nigel 91 Arguments:
1183 ph10 408 ptrptr address of the current character pointer (updated)
1184 ph10 345 cd compile background data
1185 nigel 93 name name to seek, or NULL if seeking a numbered subpattern
1186     lorn name length, or subpattern number if name is NULL
1187     xmode TRUE if we are in /x mode
1188 ph10 579 utf8 TRUE if we are in UTF-8 mode
1189 ph10 411 count pointer to the current capturing subpattern number (updated)
1190 nigel 91
1191     Returns: the number of the named subpattern, or -1 if not found
1192     */
1193    
1194     static int
1195 ph10 408 find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn,
1196 ph10 556 BOOL xmode, BOOL utf8, int *count)
1197 nigel 91 {
1198 ph10 408 uschar *ptr = *ptrptr;
1199     int start_count = *count;
1200     int hwm_count = start_count;
1201     BOOL dup_parens = FALSE;
1202 nigel 93
1203 ph10 411 /* If the first character is a parenthesis, check on the type of group we are
1204 ph10 408 dealing with. The very first call may not start with a parenthesis. */
1205    
1206     if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1207     {
1208 ph10 544 /* Handle specials such as (*SKIP) or (*UTF8) etc. */
1209 ph10 545
1210 ph10 544 if (ptr[1] == CHAR_ASTERISK) ptr += 2;
1211 ph10 545
1212 ph10 544 /* Handle a normal, unnamed capturing parenthesis. */
1213 ph10 408
1214 ph10 544 else if (ptr[1] != CHAR_QUESTION_MARK)
1215 ph10 408 {
1216     *count += 1;
1217     if (name == NULL && *count == lorn) return *count;
1218 ph10 411 ptr++;
1219 ph10 408 }
1220    
1221 ph10 544 /* All cases now have (? at the start. Remember when we are in a group
1222     where the parenthesis numbers are duplicated. */
1223    
1224     else if (ptr[2] == CHAR_VERTICAL_LINE)
1225     {
1226     ptr += 3;
1227     dup_parens = TRUE;
1228     }
1229 ph10 545
1230 ph10 544 /* Handle comments; all characters are allowed until a ket is reached. */
1231    
1232     else if (ptr[2] == CHAR_NUMBER_SIGN)
1233     {
1234     for (ptr += 3; *ptr != 0; ptr++) if (*ptr == CHAR_RIGHT_PARENTHESIS) break;
1235     goto FAIL_EXIT;
1236 ph10 545 }
1237 ph10 544
1238 ph10 408 /* Handle a condition. If it is an assertion, just carry on so that it
1239     is processed as normal. If not, skip to the closing parenthesis of the
1240 ph10 544 condition (there can't be any nested parens). */
1241 ph10 411
1242 ph10 408 else if (ptr[2] == CHAR_LEFT_PARENTHESIS)
1243     {
1244 ph10 411 ptr += 2;
1245 ph10 408 if (ptr[1] != CHAR_QUESTION_MARK)
1246     {
1247     while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
1248 ph10 411 if (*ptr != 0) ptr++;
1249 ph10 408 }
1250 ph10 411 }
1251    
1252 ph10 544 /* Start with (? but not a condition. */
1253 ph10 408
1254     else
1255 ph10 411 {
1256 ph10 408 ptr += 2;
1257     if (*ptr == CHAR_P) ptr++; /* Allow optional P */
1258    
1259     /* We have to disambiguate (?<! and (?<= from (?<name> for named groups */
1260 ph10 411
1261 ph10 408 if ((*ptr == CHAR_LESS_THAN_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK &&
1262     ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)
1263     {
1264     int term;
1265     const uschar *thisname;
1266     *count += 1;
1267     if (name == NULL && *count == lorn) return *count;
1268     term = *ptr++;
1269     if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;
1270     thisname = ptr;
1271     while (*ptr != term) ptr++;
1272     if (name != NULL && lorn == ptr - thisname &&
1273     strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1274     return *count;
1275 ph10 461 term++;
1276 ph10 411 }
1277 ph10 408 }
1278 ph10 411 }
1279 ph10 408
1280 ph10 411 /* Past any initial parenthesis handling, scan for parentheses or vertical
1281 ph10 579 bars. Stop if we get to cd->end_pattern. Note that this is important for the
1282     first-pass call when this value is temporarily adjusted to stop at the current
1283 ph10 578 position. So DO NOT change this to a test for binary zero. */
1284 ph10 408
1285 ph10 578 for (; ptr < cd->end_pattern; ptr++)
1286 nigel 91 {
1287 nigel 93 /* Skip over backslashed characters and also entire \Q...\E */
1288    
1289 ph10 391 if (*ptr == CHAR_BACKSLASH)
1290 nigel 93 {
1291 ph10 408 if (*(++ptr) == 0) goto FAIL_EXIT;
1292 ph10 391 if (*ptr == CHAR_Q) for (;;)
1293 nigel 93 {
1294 ph10 391 while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1295 ph10 408 if (*ptr == 0) goto FAIL_EXIT;
1296 ph10 391 if (*(++ptr) == CHAR_E) break;
1297 nigel 93 }
1298     continue;
1299     }
1300    
1301 ph10 340 /* Skip over character classes; this logic must be similar to the way they
1302     are handled for real. If the first character is '^', skip it. Also, if the
1303     first few characters (either before or after ^) are \Q\E or \E we skip them
1304 ph10 392 too. This makes for compatibility with Perl. Note the use of STR macros to
1305 ph10 391 encode "Q\\E" so that it works in UTF-8 on EBCDIC platforms. */
1306 nigel 93
1307 ph10 391 if (*ptr == CHAR_LEFT_SQUARE_BRACKET)
1308 nigel 93 {
1309 ph10 340 BOOL negate_class = FALSE;
1310     for (;;)
1311     {
1312 ph10 438 if (ptr[1] == CHAR_BACKSLASH)
1313 ph10 340 {
1314 ph10 438 if (ptr[2] == CHAR_E)
1315     ptr+= 2;
1316     else if (strncmp((const char *)ptr+2,
1317 ph10 392 STR_Q STR_BACKSLASH STR_E, 3) == 0)
1318 ph10 438 ptr += 4;
1319 ph10 392 else
1320 ph10 391 break;
1321 ph10 340 }
1322 ph10 438 else if (!negate_class && ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1323 ph10 461 {
1324 ph10 340 negate_class = TRUE;
1325 ph10 438 ptr++;
1326 ph10 461 }
1327 ph10 340 else break;
1328     }
1329    
1330     /* If the next character is ']', it is a data character that must be
1331 ph10 341 skipped, except in JavaScript compatibility mode. */
1332 ph10 345
1333 ph10 392 if (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET &&
1334 ph10 391 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
1335 ph10 345 ptr++;
1336    
1337 ph10 391 while (*(++ptr) != CHAR_RIGHT_SQUARE_BRACKET)
1338 nigel 93 {
1339 ph10 220 if (*ptr == 0) return -1;
1340 ph10 391 if (*ptr == CHAR_BACKSLASH)
1341 nigel 93 {
1342 ph10 408 if (*(++ptr) == 0) goto FAIL_EXIT;
1343 ph10 391 if (*ptr == CHAR_Q) for (;;)
1344 nigel 93 {
1345 ph10 391 while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1346 ph10 408 if (*ptr == 0) goto FAIL_EXIT;
1347 ph10 391 if (*(++ptr) == CHAR_E) break;
1348 nigel 93 }
1349     continue;
1350     }
1351     }
1352     continue;
1353     }
1354    
1355     /* Skip comments in /x mode */
1356    
1357 ph10 391 if (xmode && *ptr == CHAR_NUMBER_SIGN)
1358 nigel 93 {
1359 ph10 579 ptr++;
1360 ph10 556 while (*ptr != 0)
1361     {
1362     if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
1363     ptr++;
1364 ph10 579 #ifdef SUPPORT_UTF8
1365 ph10 556 if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
1366     #endif
1367     }
1368 ph10 408 if (*ptr == 0) goto FAIL_EXIT;
1369 nigel 93 continue;
1370     }
1371    
1372 ph10 408 /* Check for the special metacharacters */
1373 ph10 411
1374 ph10 408 if (*ptr == CHAR_LEFT_PARENTHESIS)
1375 nigel 93 {
1376 ph10 556 int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, count);
1377 ph10 408 if (rc > 0) return rc;
1378     if (*ptr == 0) goto FAIL_EXIT;
1379 nigel 93 }
1380 ph10 411
1381 ph10 408 else if (*ptr == CHAR_RIGHT_PARENTHESIS)
1382     {
1383     if (dup_parens && *count < hwm_count) *count = hwm_count;
1384 ph10 545 goto FAIL_EXIT;
1385 ph10 408 }
1386 ph10 411
1387     else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)
1388 ph10 408 {
1389     if (*count > hwm_count) hwm_count = *count;
1390     *count = start_count;
1391 ph10 411 }
1392 ph10 408 }
1393 nigel 93
1394 ph10 408 FAIL_EXIT:
1395     *ptrptr = ptr;
1396     return -1;
1397     }
1398 nigel 93
1399    
1400    
1401    
1402 ph10 408 /*************************************************
1403     * Find forward referenced subpattern *
1404     *************************************************/
1405 nigel 93
1406 ph10 408 /* This function scans along a pattern's text looking for capturing
1407     subpatterns, and counting them. If it finds a named pattern that matches the
1408     name it is given, it returns its number. Alternatively, if the name is NULL, it
1409     returns when it reaches a given numbered subpattern. This is used for forward
1410     references to subpatterns. We used to be able to start this scan from the
1411     current compiling point, using the current count value from cd->bracount, and
1412     do it all in a single loop, but the addition of the possibility of duplicate
1413     subpattern numbers means that we have to scan from the very start, in order to
1414     take account of such duplicates, and to use a recursive function to keep track
1415     of the different types of group.
1416    
1417     Arguments:
1418     cd compile background data
1419     name name to seek, or NULL if seeking a numbered subpattern
1420     lorn name length, or subpattern number if name is NULL
1421     xmode TRUE if we are in /x mode
1422 ph10 579 utf8 TRUE if we are in UTF-8 mode
1423 ph10 408
1424     Returns: the number of the found subpattern, or -1 if not found
1425     */
1426    
1427     static int
1428 ph10 556 find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode,
1429     BOOL utf8)
1430 ph10 408 {
1431     uschar *ptr = (uschar *)cd->start_pattern;
1432     int count = 0;
1433     int rc;
1434    
1435     /* If the pattern does not start with an opening parenthesis, the first call
1436     to find_parens_sub() will scan right to the end (if necessary). However, if it
1437     does start with a parenthesis, find_parens_sub() will return when it hits the
1438     matching closing parens. That is why we have to have a loop. */
1439    
1440 ph10 411 for (;;)
1441     {
1442 ph10 556 rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, &count);
1443 ph10 411 if (rc > 0 || *ptr++ == 0) break;
1444     }
1445    
1446 ph10 408 return rc;
1447 nigel 91 }
1448    
1449    
1450    
1451 ph10 408
1452 nigel 91 /*************************************************
1453 nigel 77 * Find first significant op code *
1454     *************************************************/
1455    
1456     /* This is called by several functions that scan a compiled expression looking
1457     for a fixed first character, or an anchoring op code etc. It skips over things
1458 ph10 602 that do not influence this. For some calls, it makes sense to skip negative
1459     forward and all backward assertions, and also the \b assertion; for others it
1460     does not.
1461 nigel 77
1462     Arguments:
1463     code pointer to the start of the group
1464     skipassert TRUE if certain assertions are to be skipped
1465    
1466     Returns: pointer to the first significant opcode
1467     */
1468    
1469     static const uschar*
1470 ph10 604 first_significant_code(const uschar *code, BOOL skipassert)
1471 nigel 77 {
1472     for (;;)
1473     {
1474     switch ((int)*code)
1475     {
1476     case OP_ASSERT_NOT:
1477     case OP_ASSERTBACK:
1478     case OP_ASSERTBACK_NOT:
1479     if (!skipassert) return code;
1480     do code += GET(code, 1); while (*code == OP_ALT);
1481     code += _pcre_OP_lengths[*code];
1482     break;
1483    
1484     case OP_WORD_BOUNDARY:
1485     case OP_NOT_WORD_BOUNDARY:
1486     if (!skipassert) return code;
1487     /* Fall through */
1488    
1489     case OP_CALLOUT:
1490     case OP_CREF:
1491 ph10 459 case OP_NCREF:
1492 nigel 93 case OP_RREF:
1493 ph10 459 case OP_NRREF:
1494 nigel 93 case OP_DEF:
1495 nigel 77 code += _pcre_OP_lengths[*code];
1496     break;
1497    
1498     default:
1499     return code;
1500     }
1501     }
1502     /* Control never reaches here */
1503     }
1504    
1505    
1506    
1507    
1508     /*************************************************
1509 ph10 454 * Find the fixed length of a branch *
1510 nigel 77 *************************************************/
1511    
1512 ph10 454 /* Scan a branch and compute the fixed length of subject that will match it,
1513 nigel 77 if the length is fixed. This is needed for dealing with backward assertions.
1514 ph10 461 In UTF8 mode, the result is in characters rather than bytes. The branch is
1515 ph10 454 temporarily terminated with OP_END when this function is called.
1516 nigel 77
1517 ph10 461 This function is called when a backward assertion is encountered, so that if it
1518     fails, the error message can point to the correct place in the pattern.
1519 ph10 454 However, we cannot do this when the assertion contains subroutine calls,
1520 ph10 461 because they can be forward references. We solve this by remembering this case
1521 ph10 454 and doing the check at the end; a flag specifies which mode we are running in.
1522    
1523 nigel 77 Arguments:
1524     code points to the start of the pattern (the bracket)
1525 ph10 604 utf8 TRUE in UTF-8 mode
1526 ph10 461 atend TRUE if called when the pattern is complete
1527     cd the "compile data" structure
1528 nigel 77
1529 ph10 461 Returns: the fixed length,
1530 ph10 454 or -1 if there is no fixed length,
1531 ph10 754 or -2 if \C was encountered (in UTF-8 mode only)
1532 ph10 454 or -3 if an OP_RECURSE item was encountered and atend is FALSE
1533 ph10 747 or -4 if an unknown opcode was encountered (internal error)
1534 nigel 77 */
1535    
1536     static int
1537 ph10 604 find_fixedlength(uschar *code, BOOL utf8, BOOL atend, compile_data *cd)
1538 nigel 77 {
1539     int length = -1;
1540    
1541     register int branchlength = 0;
1542     register uschar *cc = code + 1 + LINK_SIZE;
1543    
1544     /* Scan along the opcodes for this branch. If we get to the end of the
1545     branch, check the length against that of the other branches. */
1546    
1547     for (;;)
1548     {
1549     int d;
1550 ph10 454 uschar *ce, *cs;
1551 nigel 77 register int op = *cc;
1552     switch (op)
1553     {
1554 ph10 604 /* We only need to continue for OP_CBRA (normal capturing bracket) and
1555     OP_BRA (normal non-capturing bracket) because the other variants of these
1556     opcodes are all concerned with unlimited repeated groups, which of course
1557 ph10 747 are not of fixed length. */
1558 ph10 604
1559 nigel 93 case OP_CBRA:
1560 nigel 77 case OP_BRA:
1561     case OP_ONCE:
1562 ph10 733 case OP_ONCE_NC:
1563 nigel 77 case OP_COND:
1564 ph10 604 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), utf8, atend, cd);
1565 nigel 77 if (d < 0) return d;
1566     branchlength += d;
1567     do cc += GET(cc, 1); while (*cc == OP_ALT);
1568     cc += 1 + LINK_SIZE;
1569     break;
1570    
1571 ph10 747 /* Reached end of a branch; if it's a ket it is the end of a nested call.
1572     If it's ALT it is an alternation in a nested call. An ACCEPT is effectively
1573     an ALT. If it is END it's the end of the outer call. All can be handled by
1574     the same code. Note that we must not include the OP_KETRxxx opcodes here,
1575     because they all imply an unlimited repeat. */
1576 nigel 77
1577     case OP_ALT:
1578     case OP_KET:
1579     case OP_END:
1580 ph10 747 case OP_ACCEPT:
1581     case OP_ASSERT_ACCEPT:
1582 nigel 77 if (length < 0) length = branchlength;
1583     else if (length != branchlength) return -1;
1584     if (*cc != OP_ALT) return length;
1585     cc += 1 + LINK_SIZE;
1586     branchlength = 0;
1587     break;
1588 ph10 461
1589 ph10 454 /* A true recursion implies not fixed length, but a subroutine call may
1590     be OK. If the subroutine is a forward reference, we can't deal with
1591     it until the end of the pattern, so return -3. */
1592 ph10 461
1593 ph10 454 case OP_RECURSE:
1594     if (!atend) return -3;
1595     cs = ce = (uschar *)cd->start_code + GET(cc, 1); /* Start subpattern */
1596     do ce += GET(ce, 1); while (*ce == OP_ALT); /* End subpattern */
1597     if (cc > cs && cc < ce) return -1; /* Recursion */
1598 ph10 604 d = find_fixedlength(cs + 2, utf8, atend, cd);
1599 ph10 461 if (d < 0) return d;
1600 ph10 454 branchlength += d;
1601     cc += 1 + LINK_SIZE;
1602 ph10 461 break;
1603 nigel 77
1604     /* Skip over assertive subpatterns */
1605    
1606     case OP_ASSERT:
1607     case OP_ASSERT_NOT:
1608     case OP_ASSERTBACK:
1609     case OP_ASSERTBACK_NOT:
1610     do cc += GET(cc, 1); while (*cc == OP_ALT);
1611     /* Fall through */
1612    
1613     /* Skip over things that don't match chars */
1614    
1615 ph10 747 case OP_MARK:
1616     case OP_PRUNE_ARG:
1617     case OP_SKIP_ARG:
1618     case OP_THEN_ARG:
1619     cc += cc[1] + _pcre_OP_lengths[*cc];
1620     break;
1621    
1622 nigel 77 case OP_CALLOUT:
1623     case OP_CIRC:
1624 ph10 602 case OP_CIRCM:
1625 ph10 747 case OP_CLOSE:
1626     case OP_COMMIT:
1627     case OP_CREF:
1628     case OP_DEF:
1629 nigel 77 case OP_DOLL:
1630 ph10 602 case OP_DOLLM:
1631 ph10 747 case OP_EOD:
1632     case OP_EODN:
1633     case OP_FAIL:
1634     case OP_NCREF:
1635     case OP_NRREF:
1636 nigel 77 case OP_NOT_WORD_BOUNDARY:
1637 ph10 747 case OP_PRUNE:
1638     case OP_REVERSE:
1639     case OP_RREF:
1640     case OP_SET_SOM:
1641     case OP_SKIP:
1642     case OP_SOD:
1643     case OP_SOM:
1644     case OP_THEN:
1645 nigel 77 case OP_WORD_BOUNDARY:
1646     cc += _pcre_OP_lengths[*cc];
1647     break;
1648    
1649     /* Handle literal characters */
1650    
1651     case OP_CHAR:
1652 ph10 602 case OP_CHARI:
1653 nigel 91 case OP_NOT:
1654 ph10 604 case OP_NOTI:
1655 nigel 77 branchlength++;
1656     cc += 2;
1657     #ifdef SUPPORT_UTF8
1658 ph10 604 if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
1659 nigel 77 #endif
1660     break;
1661    
1662     /* Handle exact repetitions. The count is already in characters, but we
1663     need to skip over a multibyte character in UTF8 mode. */
1664    
1665     case OP_EXACT:
1666 ph10 747 case OP_EXACTI:
1667     case OP_NOTEXACT:
1668     case OP_NOTEXACTI:
1669 nigel 77 branchlength += GET2(cc,1);
1670     cc += 4;
1671     #ifdef SUPPORT_UTF8
1672 ph10 604 if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
1673 nigel 77 #endif
1674     break;
1675    
1676     case OP_TYPEEXACT:
1677     branchlength += GET2(cc,1);
1678 ph10 220 if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1679 nigel 77 cc += 4;
1680     break;
1681    
1682     /* Handle single-char matchers */
1683    
1684     case OP_PROP:
1685     case OP_NOTPROP:
1686 nigel 87 cc += 2;
1687 nigel 77 /* Fall through */
1688    
1689 ph10 747 case OP_HSPACE:
1690     case OP_VSPACE:
1691     case OP_NOT_HSPACE:
1692     case OP_NOT_VSPACE:
1693 nigel 77 case OP_NOT_DIGIT:
1694     case OP_DIGIT:
1695     case OP_NOT_WHITESPACE:
1696     case OP_WHITESPACE:
1697     case OP_NOT_WORDCHAR:
1698     case OP_WORDCHAR:
1699     case OP_ANY:
1700 ph10 342 case OP_ALLANY:
1701 nigel 77 branchlength++;
1702     cc++;
1703     break;
1704    
1705 ph10 754 /* The single-byte matcher isn't allowed. This only happens in UTF-8 mode;
1706     otherwise \C is coded as OP_ALLANY. */
1707 nigel 77
1708     case OP_ANYBYTE:
1709     return -2;
1710    
1711     /* Check a class for variable quantification */
1712    
1713     #ifdef SUPPORT_UTF8
1714     case OP_XCLASS:
1715     cc += GET(cc, 1) - 33;
1716     /* Fall through */
1717     #endif
1718    
1719     case OP_CLASS:
1720     case OP_NCLASS:
1721     cc += 33;
1722    
1723     switch (*cc)
1724     {
1725 ph10 747 case OP_CRPLUS:
1726     case OP_CRMINPLUS:
1727 nigel 77 case OP_CRSTAR:
1728     case OP_CRMINSTAR:
1729     case OP_CRQUERY:
1730     case OP_CRMINQUERY:
1731     return -1;
1732    
1733     case OP_CRRANGE:
1734     case OP_CRMINRANGE:
1735     if (GET2(cc,1) != GET2(cc,3)) return -1;
1736     branchlength += GET2(cc,1);
1737     cc += 5;
1738     break;
1739    
1740     default:
1741     branchlength++;
1742     }
1743     break;
1744    
1745     /* Anything else is variable length */
1746    
1747 ph10 747 case OP_ANYNL:
1748     case OP_BRAMINZERO:
1749     case OP_BRAPOS:
1750     case OP_BRAPOSZERO:
1751     case OP_BRAZERO:
1752     case OP_CBRAPOS:
1753     case OP_EXTUNI:
1754     case OP_KETRMAX:
1755     case OP_KETRMIN:
1756     case OP_KETRPOS:
1757     case OP_MINPLUS:
1758     case OP_MINPLUSI:
1759     case OP_MINQUERY:
1760     case OP_MINQUERYI:
1761     case OP_MINSTAR:
1762     case OP_MINSTARI:
1763     case OP_MINUPTO:
1764     case OP_MINUPTOI:
1765     case OP_NOTMINPLUS:
1766     case OP_NOTMINPLUSI:
1767     case OP_NOTMINQUERY:
1768     case OP_NOTMINQUERYI:
1769     case OP_NOTMINSTAR:
1770     case OP_NOTMINSTARI:
1771     case OP_NOTMINUPTO:
1772     case OP_NOTMINUPTOI:
1773     case OP_NOTPLUS:
1774     case OP_NOTPLUSI:
1775     case OP_NOTPOSPLUS:
1776     case OP_NOTPOSPLUSI:
1777     case OP_NOTPOSQUERY:
1778     case OP_NOTPOSQUERYI:
1779     case OP_NOTPOSSTAR:
1780     case OP_NOTPOSSTARI:
1781     case OP_NOTPOSUPTO:
1782     case OP_NOTPOSUPTOI:
1783     case OP_NOTQUERY:
1784     case OP_NOTQUERYI:
1785     case OP_NOTSTAR:
1786     case OP_NOTSTARI:
1787     case OP_NOTUPTO:
1788     case OP_NOTUPTOI:
1789     case OP_PLUS:
1790     case OP_PLUSI:
1791     case OP_POSPLUS:
1792     case OP_POSPLUSI:
1793     case OP_POSQUERY:
1794     case OP_POSQUERYI:
1795     case OP_POSSTAR:
1796     case OP_POSSTARI:
1797     case OP_POSUPTO:
1798     case OP_POSUPTOI:
1799     case OP_QUERY:
1800     case OP_QUERYI:
1801     case OP_REF:
1802     case OP_REFI:
1803     case OP_SBRA:
1804     case OP_SBRAPOS:
1805     case OP_SCBRA:
1806     case OP_SCBRAPOS:
1807     case OP_SCOND:
1808     case OP_SKIPZERO:
1809     case OP_STAR:
1810     case OP_STARI:
1811     case OP_TYPEMINPLUS:
1812     case OP_TYPEMINQUERY:
1813     case OP_TYPEMINSTAR:
1814     case OP_TYPEMINUPTO:
1815     case OP_TYPEPLUS:
1816     case OP_TYPEPOSPLUS:
1817     case OP_TYPEPOSQUERY:
1818     case OP_TYPEPOSSTAR:
1819     case OP_TYPEPOSUPTO:
1820     case OP_TYPEQUERY:
1821     case OP_TYPESTAR:
1822     case OP_TYPEUPTO:
1823     case OP_UPTO:
1824     case OP_UPTOI:
1825     return -1;
1826    
1827     /* Catch unrecognized opcodes so that when new ones are added they
1828     are not forgotten, as has happened in the past. */
1829    
1830 nigel 77 default:
1831 ph10 747 return -4;
1832 nigel 77 }
1833     }
1834     /* Control never gets here */
1835     }
1836    
1837    
1838    
1839    
1840     /*************************************************
1841 ph10 454 * Scan compiled regex for specific bracket *
1842 nigel 77 *************************************************/
1843    
1844     /* This little function scans through a compiled pattern until it finds a
1845 ph10 454 capturing bracket with the given number, or, if the number is negative, an
1846 ph10 461 instance of OP_REVERSE for a lookbehind. The function is global in the C sense
1847     so that it can be called from pcre_study() when finding the minimum matching
1848 ph10 455 length.
1849 nigel 77
1850     Arguments:
1851     code points to start of expression
1852     utf8 TRUE in UTF-8 mode
1853 ph10 454 number the required bracket number or negative to find a lookbehind
1854 nigel 77
1855     Returns: pointer to the opcode for the bracket, or NULL if not found
1856     */
1857    
1858 ph10 455 const uschar *
1859     _pcre_find_bracket(const uschar *code, BOOL utf8, int number)
1860 nigel 77 {
1861     for (;;)
1862     {
1863     register int c = *code;
1864 ph10 618
1865 nigel 77 if (c == OP_END) return NULL;
1866 nigel 91
1867     /* XCLASS is used for classes that cannot be represented just by a bit
1868     map. This includes negated single high-valued characters. The length in
1869     the table is zero; the actual length is stored in the compiled code. */
1870    
1871     if (c == OP_XCLASS) code += GET(code, 1);
1872 ph10 461
1873 ph10 454 /* Handle recursion */
1874 ph10 461
1875 ph10 454 else if (c == OP_REVERSE)
1876     {
1877 ph10 461 if (number < 0) return (uschar *)code;
1878 ph10 454 code += _pcre_OP_lengths[c];
1879     }
1880 nigel 91
1881 nigel 93 /* Handle capturing bracket */
1882 nigel 91
1883 ph10 604 else if (c == OP_CBRA || c == OP_SCBRA ||
1884     c == OP_CBRAPOS || c == OP_SCBRAPOS)
1885 nigel 77 {
1886 nigel 93 int n = GET2(code, 1+LINK_SIZE);
1887 nigel 77 if (n == number) return (uschar *)code;
1888 nigel 93 code += _pcre_OP_lengths[c];
1889 nigel 77 }
1890 nigel 91
1891 ph10 220 /* Otherwise, we can get the item's length from the table, except that for
1892     repeated character types, we have to test for \p and \P, which have an extra
1893 ph10 512 two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
1894 ph10 510 must add in its length. */
1895 nigel 91
1896 nigel 77 else
1897     {
1898 ph10 218 switch(c)
1899     {
1900     case OP_TYPESTAR:
1901     case OP_TYPEMINSTAR:
1902     case OP_TYPEPLUS:
1903     case OP_TYPEMINPLUS:
1904     case OP_TYPEQUERY:
1905     case OP_TYPEMINQUERY:
1906     case OP_TYPEPOSSTAR:
1907     case OP_TYPEPOSPLUS:
1908     case OP_TYPEPOSQUERY:
1909     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1910 ph10 220 break;
1911 ph10 221
1912     case OP_TYPEUPTO:
1913     case OP_TYPEMINUPTO:
1914     case OP_TYPEEXACT:
1915     case OP_TYPEPOSUPTO:
1916     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1917     break;
1918 ph10 512
1919 ph10 510 case OP_MARK:
1920     case OP_PRUNE_ARG:
1921     case OP_SKIP_ARG:
1922     code += code[1];
1923 ph10 512 break;
1924 ph10 550
1925     case OP_THEN_ARG:
1926 ph10 716 code += code[1];
1927 ph10 550 break;
1928 ph10 220 }
1929    
1930 ph10 218 /* Add in the fixed length from the table */
1931 ph10 220
1932 nigel 77 code += _pcre_OP_lengths[c];
1933 ph10 220
1934 ph10 218 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1935     a multi-byte character. The length in the table is a minimum, so we have to
1936     arrange to skip the extra bytes. */
1937 ph10 220
1938 ph10 107 #ifdef SUPPORT_UTF8
1939 nigel 77 if (utf8) switch(c)
1940     {
1941     case OP_CHAR:
1942 ph10 602 case OP_CHARI:
1943 nigel 77 case OP_EXACT:
1944 ph10 602 case OP_EXACTI:
1945 nigel 77 case OP_UPTO:
1946 ph10 602 case OP_UPTOI:
1947 nigel 77 case OP_MINUPTO:
1948 ph10 602 case OP_MINUPTOI:
1949 nigel 93 case OP_POSUPTO:
1950 ph10 602 case OP_POSUPTOI:
1951 nigel 77 case OP_STAR:
1952 ph10 602 case OP_STARI:
1953 nigel 77 case OP_MINSTAR:
1954 ph10 602 case OP_MINSTARI:
1955 nigel 93 case OP_POSSTAR:
1956 ph10 602 case OP_POSSTARI:
1957 nigel 77 case OP_PLUS:
1958 ph10 602 case OP_PLUSI:
1959 nigel 77 case OP_MINPLUS:
1960 ph10 602 case OP_MINPLUSI:
1961 nigel 93 case OP_POSPLUS:
1962 ph10 602 case OP_POSPLUSI:
1963 nigel 77 case OP_QUERY:
1964 ph10 602 case OP_QUERYI:
1965 nigel 77 case OP_MINQUERY:
1966 ph10 602 case OP_MINQUERYI:
1967 nigel 93 case OP_POSQUERY:
1968 ph10 602 case OP_POSQUERYI:
1969 nigel 93 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1970 nigel 77 break;
1971     }
1972 ph10 369 #else
1973     (void)(utf8); /* Keep compiler happy by referencing function argument */
1974 ph10 111 #endif
1975 nigel 77 }
1976     }
1977     }
1978    
1979    
1980    
1981     /*************************************************
1982     * Scan compiled regex for recursion reference *
1983     *************************************************/
1984    
1985     /* This little function scans through a compiled pattern until it finds an
1986     instance of OP_RECURSE.
1987    
1988     Arguments:
1989     code points to start of expression
1990     utf8 TRUE in UTF-8 mode
1991    
1992     Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1993     */
1994    
1995     static const uschar *
1996     find_recurse(const uschar *code, BOOL utf8)
1997     {
1998     for (;;)
1999     {
2000     register int c = *code;
2001     if (c == OP_END) return NULL;
2002 nigel 91 if (c == OP_RECURSE) return code;
2003 ph10 220
2004 nigel 91 /* XCLASS is used for classes that cannot be represented just by a bit
2005     map. This includes negated single high-valued characters. The length in
2006     the table is zero; the actual length is stored in the compiled code. */
2007    
2008     if (c == OP_XCLASS) code += GET(code, 1);
2009    
2010 ph10 220 /* Otherwise, we can get the item's length from the table, except that for
2011     repeated character types, we have to test for \p and \P, which have an extra
2012 ph10 512 two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
2013 ph10 510 must add in its length. */
2014 nigel 91
2015 nigel 77 else
2016     {
2017 ph10 218 switch(c)
2018     {
2019     case OP_TYPESTAR:
2020     case OP_TYPEMINSTAR:
2021     case OP_TYPEPLUS:
2022     case OP_TYPEMINPLUS:
2023     case OP_TYPEQUERY:
2024     case OP_TYPEMINQUERY:
2025     case OP_TYPEPOSSTAR:
2026     case OP_TYPEPOSPLUS:
2027     case OP_TYPEPOSQUERY:
2028     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2029 ph10 220 break;
2030 ph10 221
2031     case OP_TYPEPOSUPTO:
2032     case OP_TYPEUPTO:
2033     case OP_TYPEMINUPTO:
2034     case OP_TYPEEXACT:
2035     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
2036     break;
2037 ph10 512
2038 ph10 510 case OP_MARK:
2039     case OP_PRUNE_ARG:
2040     case OP_SKIP_ARG:
2041     code += code[1];
2042 ph10 512 break;
2043 ph10 550
2044     case OP_THEN_ARG:
2045 ph10 716 code += code[1];
2046 ph10 550 break;
2047 ph10 220 }
2048    
2049 ph10 218 /* Add in the fixed length from the table */
2050    
2051 nigel 77 code += _pcre_OP_lengths[c];
2052 ph10 220
2053 ph10 218 /* In UTF-8 mode, opcodes that are followed by a character may be followed
2054     by a multi-byte character. The length in the table is a minimum, so we have
2055     to arrange to skip the extra bytes. */
2056 ph10 220
2057 ph10 107 #ifdef SUPPORT_UTF8
2058 nigel 77 if (utf8) switch(c)
2059     {
2060     case OP_CHAR:
2061 ph10 602 case OP_CHARI:
2062 nigel 77 case OP_EXACT:
2063 ph10 602 case OP_EXACTI:
2064 nigel 77 case OP_UPTO:
2065 ph10 602 case OP_UPTOI:
2066 nigel 77 case OP_MINUPTO:
2067 ph10 602 case OP_MINUPTOI:
2068 nigel 93 case OP_POSUPTO:
2069 ph10 602 case OP_POSUPTOI:
2070 nigel 77 case OP_STAR:
2071 ph10 602 case OP_STARI:
2072 nigel 77 case OP_MINSTAR:
2073 ph10 602 case OP_MINSTARI:
2074 nigel 93 case OP_POSSTAR:
2075 ph10 602 case OP_POSSTARI:
2076 nigel 77 case OP_PLUS:
2077 ph10 602 case OP_PLUSI:
2078 nigel 77 case OP_MINPLUS:
2079 ph10 602 case OP_MINPLUSI:
2080 nigel 93 case OP_POSPLUS:
2081 ph10 602 case OP_POSPLUSI:
2082 nigel 77 case OP_QUERY:
2083 ph10 602 case OP_QUERYI:
2084 nigel 77 case OP_MINQUERY:
2085 ph10 602 case OP_MINQUERYI:
2086 nigel 93 case OP_POSQUERY:
2087 ph10 602 case OP_POSQUERYI:
2088 nigel 93 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
2089 nigel 77 break;
2090     }
2091 ph10 369 #else
2092     (void)(utf8); /* Keep compiler happy by referencing function argument */
2093 ph10 111 #endif
2094 nigel 77 }
2095     }
2096     }
2097    
2098    
2099    
2100     /*************************************************
2101     * Scan compiled branch for non-emptiness *
2102     *************************************************/
2103    
2104     /* This function scans through a branch of a compiled pattern to see whether it
2105 nigel 93 can match the empty string or not. It is called from could_be_empty()
2106     below and from compile_branch() when checking for an unlimited repeat of a
2107     group that can match nothing. Note that first_significant_code() skips over
2108 ph10 282 backward and negative forward assertions when its final argument is TRUE. If we
2109     hit an unclosed bracket, we return "empty" - this means we've struck an inner
2110     bracket whose current branch will already have been scanned.
2111 nigel 77
2112     Arguments:
2113     code points to start of search
2114     endcode points to where to stop
2115     utf8 TRUE if in UTF8 mode
2116 ph10 503 cd contains pointers to tables etc.
2117 nigel 77
2118     Returns: TRUE if what is matched could be empty
2119     */
2120    
2121     static BOOL
2122 ph10 503 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8,
2123     compile_data *cd)
2124 nigel 77 {
2125     register int c;
2126 ph10 604 for (code = first_significant_code(code + _pcre_OP_lengths[*code], TRUE);
2127 nigel 77 code < endcode;
2128 ph10 604 code = first_significant_code(code + _pcre_OP_lengths[c], TRUE))
2129 nigel 77 {
2130     const uschar *ccode;
2131    
2132     c = *code;
2133 ph10 507
2134 ph10 286 /* Skip over forward assertions; the other assertions are skipped by
2135 ph10 282 first_significant_code() with a TRUE final argument. */
2136 ph10 286
2137 ph10 282 if (c == OP_ASSERT)
2138 ph10 286 {
2139 ph10 282 do code += GET(code, 1); while (*code == OP_ALT);
2140     c = *code;
2141     continue;
2142 ph10 286 }
2143 ph10 172
2144 ph10 503 /* For a recursion/subroutine call, if its end has been reached, which
2145 ph10 624 implies a backward reference subroutine call, we can scan it. If it's a
2146     forward reference subroutine call, we can't. To detect forward reference
2147 ph10 654 we have to scan up the list that is kept in the workspace. This function is
2148     called only when doing the real compile, not during the pre-compile that
2149 ph10 624 measures the size of the compiled pattern. */
2150 ph10 507
2151 ph10 503 if (c == OP_RECURSE)
2152     {
2153 ph10 624 const uschar *scode;
2154     BOOL empty_branch;
2155 ph10 654
2156 ph10 624 /* Test for forward reference */
2157 ph10 654
2158 ph10 624 for (scode = cd->start_workspace; scode < cd->hwm; scode += LINK_SIZE)
2159 ph10 654 if (GET(scode, 0) == code + 1 - cd->start_code) return TRUE;
2160 ph10 624
2161     /* Not a forward reference, test for completed backward reference */
2162 ph10 654
2163 ph10 624 empty_branch = FALSE;
2164     scode = cd->start_code + GET(code, 1);
2165 ph10 503 if (GET(scode, 1) == 0) return TRUE; /* Unclosed */
2166 ph10 654
2167 ph10 624 /* Completed backwards reference */
2168 ph10 654
2169 ph10 503 do
2170     {
2171 ph10 504 if (could_be_empty_branch(scode, endcode, utf8, cd))
2172     {
2173     empty_branch = TRUE;
2174 ph10 507 break;
2175     }
2176 ph10 503 scode += GET(scode, 1);
2177     }
2178     while (*scode == OP_ALT);
2179 ph10 654
2180 ph10 504 if (!empty_branch) return FALSE; /* All branches are non-empty */
2181 ph10 503 continue;
2182 ph10 507 }
2183 ph10 170
2184 ph10 604 /* Groups with zero repeats can of course be empty; skip them. */
2185    
2186     if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO ||
2187     c == OP_BRAPOSZERO)
2188     {
2189     code += _pcre_OP_lengths[c];
2190     do code += GET(code, 1); while (*code == OP_ALT);
2191     c = *code;
2192     continue;
2193     }
2194    
2195     /* A nested group that is already marked as "could be empty" can just be
2196     skipped. */
2197    
2198     if (c == OP_SBRA || c == OP_SBRAPOS ||
2199     c == OP_SCBRA || c == OP_SCBRAPOS)
2200     {
2201     do code += GET(code, 1); while (*code == OP_ALT);
2202     c = *code;
2203     continue;
2204     }
2205    
2206 ph10 170 /* For other groups, scan the branches. */
2207 ph10 172
2208 ph10 604 if (c == OP_BRA || c == OP_BRAPOS ||
2209     c == OP_CBRA || c == OP_CBRAPOS ||
2210 ph10 723 c == OP_ONCE || c == OP_ONCE_NC ||
2211     c == OP_COND)
2212 nigel 77 {
2213     BOOL empty_branch;
2214     if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
2215 ph10 406
2216     /* If a conditional group has only one branch, there is a second, implied,
2217 ph10 395 empty branch, so just skip over the conditional, because it could be empty.
2218     Otherwise, scan the individual branches of the group. */
2219 ph10 406
2220 ph10 395 if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
2221 nigel 77 code += GET(code, 1);
2222 ph10 395 else
2223 ph10 406 {
2224 ph10 395 empty_branch = FALSE;
2225     do
2226     {
2227 ph10 503 if (!empty_branch && could_be_empty_branch(code, endcode, utf8, cd))
2228 ph10 395 empty_branch = TRUE;
2229     code += GET(code, 1);
2230     }
2231     while (*code == OP_ALT);
2232     if (!empty_branch) return FALSE; /* All branches are non-empty */
2233 nigel 77 }
2234 ph10 406
2235 ph10 172 c = *code;
2236 nigel 93 continue;
2237 nigel 77 }
2238    
2239 nigel 93 /* Handle the other opcodes */
2240    
2241     switch (c)
2242 nigel 77 {
2243 ph10 216 /* Check for quantifiers after a class. XCLASS is used for classes that
2244     cannot be represented just by a bit map. This includes negated single
2245     high-valued characters. The length in _pcre_OP_lengths[] is zero; the
2246 ph10 220 actual length is stored in the compiled code, so we must update "code"
2247 ph10 216 here. */
2248 nigel 77
2249     #ifdef SUPPORT_UTF8
2250     case OP_XCLASS:
2251 ph10 216 ccode = code += GET(code, 1);
2252 nigel 77 goto CHECK_CLASS_REPEAT;
2253     #endif
2254    
2255     case OP_CLASS:
2256     case OP_NCLASS:
2257     ccode = code + 33;
2258    
2259     #ifdef SUPPORT_UTF8
2260     CHECK_CLASS_REPEAT:
2261     #endif
2262    
2263     switch (*ccode)
2264     {
2265     case OP_CRSTAR: /* These could be empty; continue */
2266     case OP_CRMINSTAR:
2267     case OP_CRQUERY:
2268     case OP_CRMINQUERY:
2269     break;
2270    
2271     default: /* Non-repeat => class must match */
2272     case OP_CRPLUS: /* These repeats aren't empty */
2273     case OP_CRMINPLUS:
2274     return FALSE;
2275    
2276     case OP_CRRANGE:
2277     case OP_CRMINRANGE:
2278     if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
2279     break;
2280     }
2281     break;
2282    
2283     /* Opcodes that must match a character */
2284    
2285     case OP_PROP:
2286     case OP_NOTPROP:
2287     case OP_EXTUNI:
2288     case OP_NOT_DIGIT:
2289     case OP_DIGIT:
2290     case OP_NOT_WHITESPACE:
2291     case OP_WHITESPACE:
2292     case OP_NOT_WORDCHAR:
2293     case OP_WORDCHAR:
2294     case OP_ANY:
2295 ph10 345 case OP_ALLANY:
2296 nigel 77 case OP_ANYBYTE:
2297     case OP_CHAR:
2298 ph10 602 case OP_CHARI:
2299 nigel 77 case OP_NOT:
2300 ph10 602 case OP_NOTI:
2301 nigel 77 case OP_PLUS:
2302     case OP_MINPLUS:
2303 nigel 93 case OP_POSPLUS:
2304 nigel 77 case OP_EXACT:
2305     case OP_NOTPLUS:
2306     case OP_NOTMINPLUS:
2307 nigel 93 case OP_NOTPOSPLUS:
2308 nigel 77 case OP_NOTEXACT:
2309     case OP_TYPEPLUS:
2310     case OP_TYPEMINPLUS:
2311 nigel 93 case OP_TYPEPOSPLUS:
2312 nigel 77 case OP_TYPEEXACT:
2313     return FALSE;
2314 ph10 227
2315     /* These are going to continue, as they may be empty, but we have to
2316     fudge the length for the \p and \P cases. */
2317    
2318 ph10 224 case OP_TYPESTAR:
2319     case OP_TYPEMINSTAR:
2320     case OP_TYPEPOSSTAR:
2321     case OP_TYPEQUERY:
2322     case OP_TYPEMINQUERY:
2323     case OP_TYPEPOSQUERY:
2324     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2325 ph10 227 break;
2326    
2327 ph10 224 /* Same for these */
2328 ph10 227
2329 ph10 224 case OP_TYPEUPTO:
2330     case OP_TYPEMINUPTO:
2331     case OP_TYPEPOSUPTO:
2332     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
2333     break;
2334 nigel 77
2335     /* End of branch */
2336    
2337     case OP_KET:
2338     case OP_KETRMAX:
2339     case OP_KETRMIN:
2340 ph10 604 case OP_KETRPOS:
2341 nigel 77 case OP_ALT:
2342     return TRUE;
2343    
2344 nigel 93 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
2345     MINUPTO, and POSUPTO may be followed by a multibyte character */
2346 nigel 77
2347     #ifdef SUPPORT_UTF8
2348     case OP_STAR:
2349 ph10 602 case OP_STARI:
2350 nigel 77 case OP_MINSTAR:
2351 ph10 602 case OP_MINSTARI:
2352 nigel 93 case OP_POSSTAR:
2353 ph10 602 case OP_POSSTARI:
2354 nigel 77 case OP_QUERY:
2355 ph10 602 case OP_QUERYI:
2356 nigel 77 case OP_MINQUERY:
2357 ph10 602 case OP_MINQUERYI:
2358 nigel 93 case OP_POSQUERY:
2359 ph10 602 case OP_POSQUERYI:
2360 ph10 426 if (utf8 && code[1] >= 0xc0) code += _pcre_utf8_table4[code[1] & 0x3f];
2361     break;
2362 ph10 461
2363 nigel 77 case OP_UPTO:
2364 ph10 602 case OP_UPTOI:
2365 nigel 77 case OP_MINUPTO:
2366 ph10 602 case OP_MINUPTOI:
2367 nigel 93 case OP_POSUPTO:
2368 ph10 602 case OP_POSUPTOI:
2369 ph10 426 if (utf8 && code[3] >= 0xc0) code += _pcre_utf8_table4[code[3] & 0x3f];
2370 nigel 77 break;
2371     #endif
2372 ph10 503
2373 ph10 510 /* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument
2374     string. */
2375    
2376     case OP_MARK:
2377     case OP_PRUNE_ARG:
2378     case OP_SKIP_ARG:
2379     code += code[1];
2380 ph10 512 break;
2381 ph10 510
2382 ph10 550 case OP_THEN_ARG:
2383 ph10 716 code += code[1];
2384 ph10 550 break;
2385    
2386 ph10 503 /* None of the remaining opcodes are required to match a character. */
2387 ph10 507
2388 ph10 503 default:
2389 ph10 507 break;
2390 nigel 77 }
2391     }
2392    
2393     return TRUE;
2394     }
2395    
2396    
2397    
2398     /*************************************************
2399     * Scan compiled regex for non-emptiness *
2400     *************************************************/
2401    
2402     /* This function is called to check for left recursive calls. We want to check
2403     the current branch of the current pattern to see if it could match the empty
2404     string. If it could, we must look outwards for branches at other levels,
2405     stopping when we pass beyond the bracket which is the subject of the recursion.
2406 ph10 654 This function is called only during the real compile, not during the
2407 ph10 624 pre-compile.
2408 nigel 77
2409     Arguments:
2410     code points to start of the recursion
2411     endcode points to where to stop (current RECURSE item)
2412     bcptr points to the chain of current (unclosed) branch starts
2413     utf8 TRUE if in UTF-8 mode
2414 ph10 507 cd pointers to tables etc
2415 nigel 77
2416     Returns: TRUE if what is matched could be empty
2417     */
2418    
2419     static BOOL
2420     could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
2421 ph10 503 BOOL utf8, compile_data *cd)
2422 nigel 77 {
2423 ph10 475 while (bcptr != NULL && bcptr->current_branch >= code)
2424 nigel 77 {
2425 ph10 503 if (!could_be_empty_branch(bcptr->current_branch, endcode, utf8, cd))
2426 ph10 475 return FALSE;
2427 nigel 77 bcptr = bcptr->outer;
2428     }
2429     return TRUE;
2430     }
2431    
2432    
2433    
2434     /*************************************************
2435     * Check for POSIX class syntax *
2436     *************************************************/
2437    
2438     /* This function is called when the sequence "[:" or "[." or "[=" is
2439 ph10 295 encountered in a character class. It checks whether this is followed by a
2440 ph10 298 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
2441 ph10 295 reach an unescaped ']' without the special preceding character, return FALSE.
2442 nigel 77
2443 ph10 298 Originally, this function only recognized a sequence of letters between the
2444     terminators, but it seems that Perl recognizes any sequence of characters,
2445     though of course unknown POSIX names are subsequently rejected. Perl gives an
2446     "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
2447     didn't consider this to be a POSIX class. Likewise for [:1234:].
2448 ph10 295
2449 ph10 298 The problem in trying to be exactly like Perl is in the handling of escapes. We
2450     have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
2451     class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
2452     below handles the special case of \], but does not try to do any other escape
2453     processing. This makes it different from Perl for cases such as [:l\ower:]
2454 ph10 295 where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
2455 ph10 298 "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
2456 ph10 295 I think.
2457    
2458 ph10 640 A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
2459     It seems that the appearance of a nested POSIX class supersedes an apparent
2460     external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
2461 ph10 691 a digit.
2462 ph10 640
2463 ph10 661 In Perl, unescaped square brackets may also appear as part of class names. For
2464     example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
2465     [:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
2466 ph10 691 seem right at all. PCRE does not allow closing square brackets in POSIX class
2467 ph10 661 names.
2468    
2469 ph10 295 Arguments:
2470 nigel 77 ptr pointer to the initial [
2471     endptr where to return the end pointer
2472    
2473     Returns: TRUE or FALSE
2474     */
2475    
2476     static BOOL
2477 ph10 295 check_posix_syntax(const uschar *ptr, const uschar **endptr)
2478 nigel 77 {
2479     int terminator; /* Don't combine these lines; the Solaris cc */
2480     terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
2481 ph10 295 for (++ptr; *ptr != 0; ptr++)
2482 nigel 77 {
2483 ph10 654 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2484     ptr++;
2485 ph10 691 else if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2486 ph10 640 else
2487 ph10 298 {
2488 ph10 391 if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2489 ph10 295 {
2490     *endptr = ptr;
2491     return TRUE;
2492 ph10 298 }
2493 ph10 640 if (*ptr == CHAR_LEFT_SQUARE_BRACKET &&
2494     (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
2495     ptr[1] == CHAR_EQUALS_SIGN) &&
2496     check_posix_syntax(ptr, endptr))
2497 ph10 654 return FALSE;
2498 ph10 298 }
2499     }
2500 nigel 77 return FALSE;
2501     }
2502    
2503    
2504    
2505    
2506     /*************************************************
2507     * Check POSIX class name *
2508     *************************************************/
2509    
2510     /* This function is called to check the name given in a POSIX-style class entry
2511     such as [:alnum:].
2512    
2513     Arguments:
2514     ptr points to the first letter
2515     len the length of the name
2516    
2517     Returns: a value representing the name, or -1 if unknown
2518     */
2519    
2520     static int
2521     check_posix_name(const uschar *ptr, int len)
2522     {
2523 ph10 240 const char *pn = posix_names;
2524 nigel 77 register int yield = 0;
2525     while (posix_name_lengths[yield] != 0)
2526     {
2527     if (len == posix_name_lengths[yield] &&
2528 ph10 240 strncmp((const char *)ptr, pn, len) == 0) return yield;
2529 ph10 243 pn += posix_name_lengths[yield] + 1;
2530 nigel 77 yield++;
2531     }
2532     return -1;
2533     }
2534    
2535    
2536     /*************************************************
2537     * Adjust OP_RECURSE items in repeated group *
2538     *************************************************/
2539    
2540     /* OP_RECURSE items contain an offset from the start of the regex to the group
2541     that is referenced. This means that groups can be replicated for fixed
2542     repetition simply by copying (because the recursion is allowed to refer to
2543     earlier groups that are outside the current group). However, when a group is
2544 ph10 335 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
2545     inserted before it, after it has been compiled. This means that any OP_RECURSE
2546     items within it that refer to the group itself or any contained groups have to
2547     have their offsets adjusted. That one of the jobs of this function. Before it
2548     is called, the partially compiled regex must be temporarily terminated with
2549     OP_END.
2550 nigel 77
2551 nigel 93 This function has been extended with the possibility of forward references for
2552     recursions and subroutine calls. It must also check the list of such references
2553     for the group we are dealing with. If it finds that one of the recursions in
2554     the current group is on this list, it adjusts the offset in the list, not the
2555     value in the reference (which is a group number).
2556    
2557 nigel 77 Arguments:
2558     group points to the start of the group
2559     adjust the amount by which the group is to be moved
2560     utf8 TRUE in UTF-8 mode
2561     cd contains pointers to tables etc.
2562 nigel 93 save_hwm the hwm forward reference pointer at the start of the group
2563 nigel 77
2564     Returns: nothing
2565     */
2566    
2567     static void
2568 nigel 93 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
2569     uschar *save_hwm)
2570 nigel 77 {
2571     uschar *ptr = group;
2572 ph10 224
2573 nigel 77 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
2574     {
2575 nigel 93 int offset;
2576     uschar *hc;
2577    
2578     /* See if this recursion is on the forward reference list. If so, adjust the
2579     reference. */
2580 ph10 345
2581 nigel 93 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
2582     {
2583     offset = GET(hc, 0);
2584     if (cd->start_code + offset == ptr + 1)
2585     {
2586     PUT(hc, 0, offset + adjust);
2587     break;
2588     }
2589     }
2590    
2591     /* Otherwise, adjust the recursion offset if it's after the start of this
2592     group. */
2593    
2594     if (hc >= cd->hwm)
2595     {
2596     offset = GET(ptr, 1);
2597     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
2598     }
2599    
2600 nigel 77 ptr += 1 + LINK_SIZE;
2601     }
2602     }
2603    
2604    
2605    
2606     /*************************************************
2607     * Insert an automatic callout point *
2608     *************************************************/
2609    
2610     /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
2611     callout points before each pattern item.
2612    
2613     Arguments:
2614     code current code pointer
2615     ptr current pattern pointer
2616     cd pointers to tables etc
2617    
2618     Returns: new code pointer
2619     */
2620    
2621     static uschar *
2622     auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
2623     {
2624     *code++ = OP_CALLOUT;
2625     *code++ = 255;
2626 ph10 530 PUT(code, 0, (int)(ptr - cd->start_pattern)); /* Pattern offset */
2627     PUT(code, LINK_SIZE, 0); /* Default length */
2628 nigel 77 return code + 2*LINK_SIZE;
2629     }
2630    
2631    
2632    
2633     /*************************************************
2634     * Complete a callout item *
2635     *************************************************/
2636    
2637     /* A callout item contains the length of the next item in the pattern, which
2638     we can't fill in till after we have reached the relevant point. This is used
2639     for both automatic and manual callouts.
2640    
2641     Arguments:
2642     previous_callout points to previous callout item
2643     ptr current pattern pointer
2644     cd pointers to tables etc
2645    
2646     Returns: nothing
2647     */
2648    
2649     static void
2650     complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
2651     {
2652 ph10 530 int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));
2653 nigel 77 PUT(previous_callout, 2 + LINK_SIZE, length);
2654     }
2655    
2656    
2657    
2658     #ifdef SUPPORT_UCP
2659     /*************************************************
2660     * Get othercase range *
2661     *************************************************/
2662    
2663     /* This function is passed the start and end of a class range, in UTF-8 mode
2664     with UCP support. It searches up the characters, looking for internal ranges of
2665     characters in the "other" case. Each call returns the next one, updating the
2666     start address.
2667    
2668     Arguments:
2669     cptr points to starting character value; updated
2670     d end value
2671     ocptr where to put start of othercase range
2672     odptr where to put end of othercase range
2673    
2674     Yield: TRUE when range returned; FALSE when no more
2675     */
2676    
2677     static BOOL
2678 nigel 93 get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
2679     unsigned int *odptr)
2680 nigel 77 {
2681 nigel 93 unsigned int c, othercase, next;
2682 nigel 77
2683     for (c = *cptr; c <= d; c++)
2684 ph10 349 { if ((othercase = UCD_OTHERCASE(c)) != c) break; }
2685 nigel 77
2686     if (c > d) return FALSE;
2687    
2688     *ocptr = othercase;
2689     next = othercase + 1;
2690    
2691     for (++c; c <= d; c++)
2692     {
2693 ph10 349 if (UCD_OTHERCASE(c) != next) break;
2694 nigel 77 next++;
2695     }
2696    
2697     *odptr = next - 1;
2698     *cptr = c;
2699    
2700     return TRUE;
2701     }
2702 ph10 532
2703    
2704    
2705     /*************************************************
2706     * Check a character and a property *
2707     *************************************************/
2708    
2709     /* This function is called by check_auto_possessive() when a property item
2710     is adjacent to a fixed character.
2711    
2712     Arguments:
2713     c the character
2714     ptype the property type
2715     pdata the data for the type
2716     negated TRUE if it's a negated property (\P or \p{^)
2717 ph10 535
2718 ph10 532 Returns: TRUE if auto-possessifying is OK
2719 ph10 535 */
2720 ph10 532
2721     static BOOL
2722     check_char_prop(int c, int ptype, int pdata, BOOL negated)
2723     {
2724     const ucd_record *prop = GET_UCD(c);
2725     switch(ptype)
2726     {
2727     case PT_LAMP:
2728     return (prop->chartype == ucp_Lu ||
2729     prop->chartype == ucp_Ll ||
2730     prop->chartype == ucp_Lt) == negated;
2731    
2732     case PT_GC:
2733     return (pdata == _pcre_ucp_gentype[prop->chartype]) == negated;
2734    
2735     case PT_PC:
2736     return (pdata == prop->chartype) == negated;
2737    
2738     case PT_SC:
2739     return (pdata == prop->script) == negated;
2740    
2741     /* These are specials */
2742    
2743     case PT_ALNUM:
2744     return (_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2745     _pcre_ucp_gentype[prop->chartype] == ucp_N) == negated;
2746    
2747     case PT_SPACE: /* Perl space */
2748     return (_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2749     c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2750     == negated;
2751    
2752     case PT_PXSPACE: /* POSIX space */
2753     return (_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2754     c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2755     c == CHAR_FF || c == CHAR_CR)
2756     == negated;
2757    
2758     case PT_WORD:
2759     return (_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2760     _pcre_ucp_gentype[prop->chartype] == ucp_N ||
2761     c == CHAR_UNDERSCORE) == negated;
2762     }
2763 ph10 535 return FALSE;
2764 ph10 532 }
2765 nigel 77 #endif /* SUPPORT_UCP */
2766    
2767    
2768 nigel 93
2769 nigel 77 /*************************************************
2770 nigel 93 * Check if auto-possessifying is possible *
2771     *************************************************/
2772    
2773     /* This function is called for unlimited repeats of certain items, to see
2774     whether the next thing could possibly match the repeated item. If not, it makes
2775     sense to automatically possessify the repeated item.
2776    
2777     Arguments:
2778 ph10 532 previous pointer to the repeated opcode
2779 nigel 93 utf8 TRUE in UTF-8 mode
2780     ptr next character in pattern
2781     options options bits
2782     cd contains pointers to tables etc.
2783    
2784     Returns: TRUE if possessifying is wanted
2785     */
2786    
2787     static BOOL
2788 ph10 535 check_auto_possessive(const uschar *previous, BOOL utf8, const uschar *ptr,
2789 ph10 532 int options, compile_data *cd)
2790 nigel 93 {
2791 ph10 532 int c, next;
2792     int op_code = *previous++;
2793 nigel 93
2794     /* Skip whitespace and comments in extended mode */
2795    
2796     if ((options & PCRE_EXTENDED) != 0)
2797     {
2798     for (;;)
2799     {
2800     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2801 ph10 391 if (*ptr == CHAR_NUMBER_SIGN)
2802 nigel 93 {
2803 ph10 579 ptr++;
2804 ph10 556 while (*ptr != 0)
2805     {
2806 nigel 93 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2807 ph10 556 ptr++;
2808 ph10 579 #ifdef SUPPORT_UTF8
2809 ph10 556 if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
2810     #endif
2811     }
2812 nigel 93 }
2813     else break;
2814     }
2815     }
2816    
2817     /* If the next item is one that we can handle, get its value. A non-negative
2818     value is a character, a negative value is an escape value. */
2819    
2820 ph10 391 if (*ptr == CHAR_BACKSLASH)
2821 nigel 93 {
2822     int temperrorcode = 0;
2823     next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2824     if (temperrorcode != 0) return FALSE;
2825     ptr++; /* Point after the escape sequence */
2826     }
2827    
2828     else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2829     {
2830     #ifdef SUPPORT_UTF8
2831     if (utf8) { GETCHARINC(next, ptr); } else
2832     #endif
2833     next = *ptr++;
2834     }
2835    
2836     else return FALSE;
2837    
2838     /* Skip whitespace and comments in extended mode */
2839    
2840     if ((options & PCRE_EXTENDED) != 0)
2841     {
2842     for (;;)
2843     {
2844     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2845 ph10 391 if (*ptr == CHAR_NUMBER_SIGN)
2846 nigel 93 {
2847 ph10 579 ptr++;
2848 ph10 556 while (*ptr != 0)
2849     {
2850 nigel 93 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2851 ph10 556 ptr++;
2852 ph10 579 #ifdef SUPPORT_UTF8
2853 ph10 556 if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
2854     #endif
2855     }
2856 nigel 93 }
2857     else break;
2858     }
2859     }
2860    
2861     /* If the next thing is itself optional, we have to give up. */
2862    
2863 ph10 392 if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
2864 ph10 391 strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
2865     return FALSE;
2866 nigel 93
2867 ph10 532 /* Now compare the next item with the previous opcode. First, handle cases when
2868     the next item is a character. */
2869 nigel 93
2870     if (next >= 0) switch(op_code)
2871     {
2872     case OP_CHAR:
2873 ph10 535 #ifdef SUPPORT_UTF8
2874 ph10 532 GETCHARTEST(c, previous);
2875 ph10 369 #else
2876 ph10 532 c = *previous;
2877 ph10 535 #endif
2878     return c != next;
2879 nigel 93
2880 ph10 602 /* For CHARI (caseless character) we must check the other case. If we have
2881 nigel 93 Unicode property support, we can use it to test the other case of
2882     high-valued characters. */
2883    
2884 ph10 602 case OP_CHARI:
2885 ph10 535 #ifdef SUPPORT_UTF8
2886 ph10 532 GETCHARTEST(c, previous);
2887     #else
2888     c = *previous;
2889 ph10 535 #endif
2890 ph10 532 if (c == next) return FALSE;
2891 nigel 93 #ifdef SUPPORT_UTF8
2892     if (utf8)
2893     {
2894     unsigned int othercase;
2895     if (next < 128) othercase = cd->fcc[next]; else
2896     #ifdef SUPPORT_UCP
2897 ph10 349 othercase = UCD_OTHERCASE((unsigned int)next);
2898 nigel 93 #else
2899     othercase = NOTACHAR;
2900     #endif
2901 ph10 532 return (unsigned int)c != othercase;
2902 nigel 93 }
2903     else
2904     #endif /* SUPPORT_UTF8 */
2905 ph10 532 return (c != cd->fcc[next]); /* Non-UTF-8 mode */
2906 nigel 93
2907 ph10 602 /* For OP_NOT and OP_NOTI, the data is always a single-byte character. These
2908 ph10 604 opcodes are not used for multi-byte characters, because they are coded using
2909 ph10 602 an XCLASS instead. */
2910 nigel 93
2911     case OP_NOT:
2912 ph10 602 return (c = *previous) == next;
2913 ph10 604
2914     case OP_NOTI:
2915 ph10 532 if ((c = *previous) == next) return TRUE;
2916 nigel 93 #ifdef SUPPORT_UTF8
2917     if (utf8)
2918     {
2919     unsigned int othercase;
2920     if (next < 128) othercase = cd->fcc[next]; else
2921     #ifdef SUPPORT_UCP
2922 ph10 349 othercase = UCD_OTHERCASE(next);
2923 nigel 93 #else
2924     othercase = NOTACHAR;
2925     #endif
2926 ph10 532 return (unsigned int)c == othercase;
2927 nigel 93 }
2928     else
2929     #endif /* SUPPORT_UTF8 */
2930 ph10 532 return (c == cd->fcc[next]); /* Non-UTF-8 mode */
2931 nigel 93
2932 ph10 535 /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.
2933     When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
2934    
2935 nigel 93 case OP_DIGIT:
2936     return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2937    
2938     case OP_NOT_DIGIT:
2939     return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2940    
2941     case OP_WHITESPACE:
2942     return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2943    
2944     case OP_NOT_WHITESPACE:
2945     return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2946    
2947     case OP_WORDCHAR:
2948     return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2949    
2950     case OP_NOT_WORDCHAR:
2951     return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2952    
2953 ph10 180 case OP_HSPACE:
2954     case OP_NOT_HSPACE:
2955     switch(next)
2956     {
2957     case 0x09:
2958     case 0x20:
2959     case 0xa0:
2960     case 0x1680:
2961     case 0x180e:
2962     case 0x2000:
2963     case 0x2001:
2964     case 0x2002:
2965     case 0x2003:
2966     case 0x2004:
2967     case 0x2005:
2968     case 0x2006:
2969     case 0x2007:
2970     case 0x2008:
2971     case 0x2009:
2972     case 0x200A:
2973     case 0x202f:
2974     case 0x205f:
2975     case 0x3000:
2976 ph10 528 return op_code == OP_NOT_HSPACE;
2977 ph10 180 default:
2978 ph10 528 return op_code != OP_NOT_HSPACE;
2979 ph10 180 }
2980    
2981 ph10 528 case OP_ANYNL:
2982 ph10 180 case OP_VSPACE:
2983     case OP_NOT_VSPACE:
2984     switch(next)
2985     {
2986     case 0x0a:
2987     case 0x0b:
2988     case 0x0c:
2989     case 0x0d:
2990     case 0x85:
2991     case 0x2028:
2992     case 0x2029:
2993 ph10 528 return op_code == OP_NOT_VSPACE;
2994 ph10 180 default:
2995 ph10 528 return op_code != OP_NOT_VSPACE;
2996 ph10 180 }
2997    
2998 ph10 532 #ifdef SUPPORT_UCP
2999     case OP_PROP:
3000     return check_char_prop(next, previous[0], previous[1], FALSE);
3001 ph10 535
3002 ph10 532 case OP_NOTPROP:
3003     return check_char_prop(next, previous[0], previous[1], TRUE);
3004     #endif
3005    
3006 nigel 93 default:
3007     return FALSE;
3008     }
3009    
3010    
3011 ph10 535 /* Handle the case when the next item is \d, \s, etc. Note that when PCRE_UCP
3012     is set, \d turns into ESC_du rather than ESC_d, etc., so ESC_d etc. are
3013     generated only when PCRE_UCP is *not* set, that is, when only ASCII
3014     characteristics are recognized. Similarly, the opcodes OP_DIGIT etc. are
3015 ph10 532 replaced by OP_PROP codes when PCRE_UCP is set. */
3016 nigel 93
3017     switch(op_code)
3018     {
3019     case OP_CHAR:
3020 ph10 602 case OP_CHARI:
3021 ph10 535 #ifdef SUPPORT_UTF8
3022 ph10 532 GETCHARTEST(c, previous);
3023     #else
3024     c = *previous;
3025 ph10 535 #endif
3026 nigel 93 switch(-next)
3027     {
3028     case ESC_d:
3029 ph10 532 return c > 127 || (cd->ctypes[c] & ctype_digit) == 0;
3030 nigel 93
3031     case ESC_D:
3032 ph10 532 return c <= 127 && (cd->ctypes[c] & ctype_digit) != 0;
3033 nigel 93
3034     case ESC_s:
3035 ph10 532 return c > 127 || (cd->ctypes[c] & ctype_space) == 0;
3036 nigel 93
3037     case ESC_S:
3038 ph10 532 return c <= 127 && (cd->ctypes[c] & ctype_space) != 0;
3039 nigel 93
3040     case ESC_w:
3041 ph10 532 return c > 127 || (cd->ctypes[c] & ctype_word) == 0;
3042 nigel 93
3043     case ESC_W:
3044 ph10 532 return c <= 127 && (cd->ctypes[c] & ctype_word) != 0;
3045 ph10 182
3046 ph10 180 case ESC_h:
3047     case ESC_H:
3048 ph10 532 switch(c)
3049 ph10 180 {
3050     case 0x09:
3051     case 0x20:
3052     case 0xa0:
3053     case 0x1680:
3054     case 0x180e:
3055     case 0x2000:
3056     case 0x2001:
3057     case 0x2002:
3058     case 0x2003:
3059     case 0x2004:
3060     case 0x2005:
3061     case 0x2006:
3062     case 0x2007:
3063     case 0x2008:
3064     case 0x2009:
3065     case 0x200A:
3066     case 0x202f:
3067     case 0x205f:
3068     case 0x3000:
3069     return -next != ESC_h;
3070     default:
3071     return -next == ESC_h;
3072 ph10 182 }
3073    
3074 ph10 180 case ESC_v:
3075     case ESC_V:
3076 ph10 532 switch(c)
3077 ph10 180 {
3078     case 0x0a:
3079     case 0x0b:
3080     case 0x0c:
3081     case 0x0d:
3082     case 0x85:
3083     case 0x2028:
3084     case 0x2029:
3085     return -next != ESC_v;
3086     default:
3087     return -next == ESC_v;
3088 ph10 182 }
3089 ph10 535
3090     /* When PCRE_UCP is set, these values get generated for \d etc. Find
3091     their substitutions and process them. The result will always be either
3092 ph10 532 -ESC_p or -ESC_P. Then fall through to process those values. */
3093 ph10 535
3094 ph10 532 #ifdef SUPPORT_UCP
3095     case ESC_du:
3096     case ESC_DU:
3097     case ESC_wu:
3098     case ESC_WU:
3099     case ESC_su:
3100     case ESC_SU:
3101     {
3102     int temperrorcode = 0;
3103     ptr = substitutes[-next - ESC_DU];
3104     next = check_escape(&ptr, &temperrorcode, 0, options, FALSE);
3105     if (temperrorcode != 0) return FALSE;
3106     ptr++; /* For compatibility */
3107     }
3108 ph10 535 /* Fall through */
3109 nigel 93
3110 ph10 532 case ESC_p:
3111     case ESC_P:
3112     {
3113     int ptype, pdata, errorcodeptr;
3114 ph10 535 BOOL negated;
3115    
3116 ph10 532 ptr--; /* Make ptr point at the p or P */
3117     ptype = get_ucp(&ptr, &negated, &pdata, &errorcodeptr);
3118     if (ptype < 0) return FALSE;
3119     ptr++; /* Point past the final curly ket */
3120 ph10 535
3121 ph10 532 /* If the property item is optional, we have to give up. (When generated
3122     from \d etc by PCRE_UCP, this test will have been applied much earlier,
3123     to the original \d etc. At this point, ptr will point to a zero byte. */
3124 ph10 535
3125 ph10 532 if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
3126     strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
3127     return FALSE;
3128 ph10 535
3129 ph10 532 /* Do the property check. */
3130 ph10 535
3131 ph10 532 return check_char_prop(c, ptype, pdata, (next == -ESC_P) != negated);
3132 ph10 535 }
3133 ph10 532 #endif
3134    
3135 nigel 93 default:
3136     return FALSE;
3137     }
3138    
3139 ph10 535 /* In principle, support for Unicode properties should be integrated here as
3140     well. It means re-organizing the above code so as to get hold of the property
3141     values before switching on the op-code. However, I wonder how many patterns
3142     combine ASCII \d etc with Unicode properties? (Note that if PCRE_UCP is set,
3143     these op-codes are never generated.) */
3144    
3145 nigel 93 case OP_DIGIT:
3146 ph10 180 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
3147 ph10 528 next == -ESC_h || next == -ESC_v || next == -ESC_R;
3148 nigel 93
3149     case OP_NOT_DIGIT:
3150     return next == -ESC_d;
3151    
3152     case OP_WHITESPACE:
3153 ph10 528 return next == -ESC_S || next == -ESC_d || next == -ESC_w || next == -ESC_R;
3154 nigel 93
3155     case OP_NOT_WHITESPACE:
3156 ph10 180 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
3157 nigel 93
3158 ph10 180 case OP_HSPACE:
3159 ph10 535 return next == -ESC_S || next == -ESC_H || next == -ESC_d ||
3160 ph10 528 next == -ESC_w || next == -ESC_v || next == -ESC_R;
3161 ph10 180
3162     case OP_NOT_HSPACE:
3163     return next == -ESC_h;
3164 ph10 182
3165 ph10 180 /* Can't have \S in here because VT matches \S (Perl anomaly) */
3166 ph10 535 case OP_ANYNL:
3167 ph10 182 case OP_VSPACE:
3168 ph10 180 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
3169    
3170     case OP_NOT_VSPACE:
3171 ph10 528 return next == -ESC_v || next == -ESC_R;
3172 ph10 180
3173 nigel 93 case OP_WORDCHAR:
3174 ph10 535 return next == -ESC_W || next == -ESC_s || next == -ESC_h ||
3175 ph10 528 next == -ESC_v || next == -ESC_R;
3176 nigel 93
3177     case OP_NOT_WORDCHAR:
3178     return next == -ESC_w || next == -ESC_d;
3179 ph10 182
3180 nigel 93 default:
3181     return FALSE;
3182     }
3183    
3184     /* Control does not reach here */
3185     }
3186    
3187    
3188    
3189     /*************************************************
3190 nigel 77 * Compile one branch *
3191     *************************************************/
3192    
2193 nigel 93 /* Scan the pattern, compiling it into a vector. If the options are
3194 nigel 77 changed during the branch, the pointer is used to change the external options
3195 nigel 93 bits. This function is used during the pre-compile phase when we are trying
3196     to find out the amount of memory needed, as well as during the real compile
3197     phase. The value of lengthptr distinguishes the two phases.
3198 nigel 77
3199     Arguments:
3200     optionsptr pointer to the option bits
3201     codeptr points to the pointer to the current code point
3202     ptrptr points to the current pattern pointer
3203     errorcodeptr points to error code variable
3204     firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
3205     reqbyteptr set to the last literal character required, else < 0
3206     bcptr points to current branch chain
3207 ph10 654 cond_depth conditional nesting depth
3208 nigel 77 cd contains pointers to tables etc.
3209 nigel 93 lengthptr NULL during the real compile phase
3210     points to length accumulator during pre-compile phase
3211 nigel 77
3212     Returns: TRUE on success
3213     FALSE, with *errorcodeptr set non-zero on error
3214     */
3215    
3216     static BOOL
3217 nigel 93 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
3218     int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
3219 ph10 642 int cond_depth, compile_data *cd, int *lengthptr)
3220 nigel 77 {
3221     int repeat_type, op_type;
3222     int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
3223     int bravalue = 0;
3224     int greedy_default, greedy_non_default;
3225     int firstbyte, reqbyte;
3226     int zeroreqbyte, zerofirstbyte;
3227     int req_caseopt, reqvary, tempreqvary;
3228 ph10 635 int options = *optionsptr; /* May change dynamically */
3229 nigel 77 int after_manual_callout = 0;
3230 nigel 93 int length_prevgroup = 0;
3231 nigel 77 register int c;
3232     register uschar *code = *codeptr;
3233 nigel 93 uschar *last_code = code;
3234     uschar *orig_code = code;
3235 nigel 77 uschar *tempcode;
3236     BOOL inescq = FALSE;
3237     BOOL groupsetfirstbyte = FALSE;
3238     const uschar *ptr = *ptrptr;
3239     const uschar *tempptr;
3240 ph10 518 const uschar *nestptr = NULL;
3241 nigel 77 uschar *previous = NULL;
3242     uschar *previous_callout = NULL;
3243 nigel 93 uschar *save_hwm = NULL;
3244 nigel 77 uschar classbits[32];
3245    
3246 ph10 635 /* We can fish out the UTF-8 setting once and for all into a BOOL, but we
3247 ph10 654 must not do this for other options (e.g. PCRE_EXTENDED) because they may change
3248 ph10 635 dynamically as we process the pattern. */
3249    
3250 nigel 77 #ifdef SUPPORT_UTF8
3251     BOOL class_utf8;
3252     BOOL utf8 = (options & PCRE_UTF8) != 0;
3253     uschar *class_utf8data;
3254 ph10 300 uschar *class_utf8data_base;
3255 nigel 77 uschar utf8_char[6];
3256     #else
3257     BOOL utf8 = FALSE;
3258     #endif
3259    
3260 ph10 475 #ifdef PCRE_DEBUG
3261 nigel 93 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
3262     #endif
3263    
3264 nigel 77 /* Set up the default and non-default settings for greediness */
3265    
3266     greedy_default = ((options & PCRE_UNGREEDY) != 0);
3267     greedy_non_default = greedy_default ^ 1;
3268    
3269     /* Initialize no first byte, no required byte. REQ_UNSET means "no char
3270     matching encountered yet". It gets changed to REQ_NONE if we hit something that
3271     matches a non-fixed char first char; reqbyte just remains unset if we never
3272     find one.
3273    
3274     When we hit a repeat whose minimum is zero, we may have to adjust these values
3275     to take the zero repeat into account. This is implemented by setting them to
3276     zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
3277     item types that can be repeated set these backoff variables appropriately. */
3278    
3279     firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
3280    
3281     /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
3282     according to the current setting of the caseless flag. REQ_CASELESS is a bit
3283     value > 255. It is added into the firstbyte or reqbyte variables to record the
3284     case status of the value. This is used only for ASCII characters. */
3285    
3286     req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
3287    
3288     /* Switch on next character until the end of the branch */
3289    
3290     for (;; ptr++)
3291     {
3292     BOOL negate_class;
3293 ph10 286 BOOL should_flip_negation;
3294 nigel 77 BOOL possessive_quantifier;
3295     BOOL is_quantifier;
3296 nigel 93 BOOL is_recurse;
3297 ph10 180 BOOL reset_bracount;
3298 nigel 77 int class_charcount;
3299     int class_lastchar;
3300     int newoptions;
3301     int recno;
3302 ph10 172 int refsign;
3303 nigel 77 int skipbytes;
3304     int subreqbyte;
3305     int subfirstbyte;
3306 nigel 93 int terminator;
3307 nigel 77 int mclength;
3308 ph10 733 int tempbracount;
3309 nigel 77 uschar mcbuffer[8];
3310    
3311 nigel 93 /* Get next byte in the pattern */
3312 nigel 77
3313     c = *ptr;
3314 ph10 345
3315 ph10 535 /* If we are at the end of a nested substitution, revert to the outer level
3316 ph10 518 string. Nesting only happens one level deep. */
3317    
3318     if (c == 0 && nestptr != NULL)
3319     {
3320     ptr = nestptr;
3321     nestptr = NULL;
3322     c = *ptr;
3323     }
3324    
3325 nigel 93 /* If we are in the pre-compile phase, accumulate the length used for the
3326     previous cycle of this loop. */
3327    
3328     if (lengthptr != NULL)
3329     {
3330 ph10 475 #ifdef PCRE_DEBUG
3331 nigel 93 if (code > cd->hwm) cd->hwm = code; /* High water info */
3332     #endif
3333 ph10 505 if (code > cd->start_workspace + WORK_SIZE_CHECK) /* Check for overrun */
3334 nigel 93 {
3335     *errorcodeptr = ERR52;
3336     goto FAILED;
3337     }
3338    
3339     /* There is at least one situation where code goes backwards: this is the
3340     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
3341     the class is simply eliminated. However, it is created first, so we have to
3342     allow memory for it. Therefore, don't ever reduce the length at this point.
3343     */
3344    
3345     if (code < last_code) code = last_code;
3346 ph10 202
3347     /* Paranoid check for integer overflow */
3348    
3349     if (OFLOW_MAX - *lengthptr < code - last_code)
3350     {
3351     *errorcodeptr = ERR20;
3352     goto FAILED;
3353     }
3354    
3355 ph10 530 *lengthptr += (int)(code - last_code);
3356 ph10 751 DPRINTF(("length=%d added %d c=%c\n", *lengthptr, (int)(code - last_code),
3357     c));
3358 nigel 93
3359     /* If "previous" is set and it is not at the start of the work space, move
3360     it back to there, in order to avoid filling up the work space. Otherwise,
3361     if "previous" is NULL, reset the current code pointer to the start. */
3362    
3363     if (previous != NULL)
3364     {
3365     if (previous > orig_code)
3366     {
3367     memmove(orig_code, previous, code - previous);
3368     code -= previous - orig_code;
3369     previous = orig_code;
3370     }
3371     }
3372     else code = orig_code;
3373    
3374     /* Remember where this code item starts so we can pick up the length
3375     next time round. */
3376    
3377     last_code = code;
3378     }
3379    
3380     /* In the real compile phase, just check the workspace used by the forward
3381     reference list. */
3382    
3383 ph10 505 else if (cd->hwm > cd->start_workspace + WORK_SIZE_CHECK)
3384 nigel 93 {
3385     *errorcodeptr = ERR52;
3386     goto FAILED;
3387     }
3388    
3389 nigel 77 /* If in \Q...\E, check for the end; if not, we have a literal */
3390    
3391     if (inescq && c != 0)
3392     {
3393 ph10 391 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
3394 nigel 77 {
3395     inescq = FALSE;
3396     ptr++;
3397     continue;
3398     }
3399     else
3400     {
3401     if (previous_callout != NULL)
3402     {
3403 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
3404     complete_callout(previous_callout, ptr, cd);
3405 nigel 77 previous_callout = NULL;
3406     }
3407     if ((options & PCRE_AUTO_CALLOUT) != 0)
3408     {
3409     previous_callout = code;
3410     code = auto_callout(code, ptr, cd);
3411     }
3412     goto NORMAL_CHAR;
3413     }
3414     }
3415    
3416     /* Fill in length of a previous callout, except when the next thing is
3417     a quantifier. */
3418    
3419 ph10 392 is_quantifier =
3420 ph10 391 c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
3421     (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
3422 nigel 77
3423     if (!is_quantifier && previous_callout != NULL &&
3424     after_manual_callout-- <= 0)
3425     {
3426 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
3427     complete_callout(previous_callout, ptr, cd);
3428 nigel 77 previous_callout = NULL;
3429     }
3430    
3431 ph10 635 /* In extended mode, skip white space and comments. */
3432 nigel 77
3433     if ((options & PCRE_EXTENDED) != 0)
3434     {
3435     if ((cd->ctypes[c] & ctype_space) != 0) continue;
3436 ph10 391 if (c == CHAR_NUMBER_SIGN)
3437 nigel 77 {
3438 ph10 579 ptr++;
3439 ph10 556 while (*ptr != 0)
3440 nigel 91 {
3441 nigel 93 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
3442 ph10 556 ptr++;
3443 ph10 579 #ifdef SUPPORT_UTF8
3444 ph10 556 if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
3445     #endif
3446 nigel 91 }
3447 nigel 93 if (*ptr != 0) continue;
3448    
3449 nigel 91 /* Else fall through to handle end of string */
3450     c = 0;
3451 nigel 77 }
3452     }
3453    
3454     /* No auto callout for quantifiers. */
3455    
3456     if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
3457     {
3458     previous_callout = code;
3459     code = auto_callout(code, ptr, cd);
3460     }
3461    
3462     switch(c)
3463     {
3464 nigel 93 /* ===================================================================*/
3465     case 0: /* The branch terminates at string end */
3466 ph10 391 case CHAR_VERTICAL_LINE: /* or | or ) */
3467     case CHAR_RIGHT_PARENTHESIS:
3468 nigel 77 *firstbyteptr = firstbyte;
3469     *reqbyteptr = reqbyte;
3470     *codeptr = code;
3471     *ptrptr = ptr;
3472 nigel 93 if (lengthptr != NULL)
3473     {
3474 ph10 202 if (OFLOW_MAX - *lengthptr < code - last_code)
3475     {
3476     *errorcodeptr = ERR20;
3477     goto FAILED;
3478     }
3479 ph10 530 *lengthptr += (int)(code - last_code); /* To include callout length */
3480 nigel 93 DPRINTF((">> end branch\n"));
3481     }
3482 nigel 77 return TRUE;
3483    
3484 nigel 93
3485     /* ===================================================================*/
3486 nigel 77 /* Handle single-character metacharacters. In multiline mode, ^ disables
3487     the setting of any following char as a first character. */
3488    
3489 ph10 391 case CHAR_CIRCUMFLEX_ACCENT:
3490 ph10 602 previous = NULL;
3491 nigel 77 if ((options & PCRE_MULTILINE) != 0)
3492     {
3493     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3494 ph10 602 *code++ = OP_CIRCM;
3495 nigel 77 }
3496 ph10 602 else *code++ = OP_CIRC;
3497 nigel 77 break;
3498    
3499 ph10 391 case CHAR_DOLLAR_SIGN:
3500 nigel 77 previous = NULL;
3501 ph10 602 *code++ = ((options & PCRE_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
3502 nigel 77 break;
3503    
3504     /* There can never be a first char if '.' is first, whatever happens about
3505     repeats. The value of reqbyte doesn't change either. */
3506    
3507 ph10 391 case CHAR_DOT:
3508 nigel 77 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3509     zerofirstbyte = firstbyte;
3510     zeroreqbyte = reqbyte;
3511     previous = code;
3512 ph10 342 *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
3513 nigel 77 break;
3514    
3515 nigel 93
3516     /* ===================================================================*/
3517 nigel 87 /* Character classes. If the included characters are all < 256, we build a
3518     32-byte bitmap of the permitted characters, except in the special case
3519     where there is only one such character. For negated classes, we build the
3520     map as usual, then invert it at the end. However, we use a different opcode
3521     so that data characters > 255 can be handled correctly.
3522 nigel 77
3523     If the class contains characters outside the 0-255 range, a different
3524     opcode is compiled. It may optionally have a bit map for characters < 256,
3525     but those above are explicitly listed afterwards. A flag byte tells
3526     whether the bitmap is present, and whether this is a negated class or not.
3527 ph10 345
3528 ph10 336 In JavaScript compatibility mode, an isolated ']' causes an error. In
3529     default (Perl) mode, it is treated as a data character. */
3530 ph10 345
3531 ph10 391 case CHAR_RIGHT_SQUARE_BRACKET:
3532 ph10 336 if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
3533     {
3534     *errorcodeptr = ERR64;
3535 ph10 345 goto FAILED;
3536 ph10 336 }
3537 ph10 345 goto NORMAL_CHAR;
3538 nigel 77
3539 ph10 391 case CHAR_LEFT_SQUARE_BRACKET:
3540 nigel 77 previous = code;
3541    
3542     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
3543     they are encountered at the top level, so we'll do that too. */
3544    
3545 ph10 392 if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3546 ph10 391 ptr[1] == CHAR_EQUALS_SIGN) &&
3547 ph10 295 check_posix_syntax(ptr, &tempptr))
3548 nigel 77 {
3549 ph10 391 *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
3550 nigel 77 goto FAILED;
3551     }
3552    
3553 ph10 205 /* If the first character is '^', set the negation flag and skip it. Also,
3554 ph10 208 if the first few characters (either before or after ^) are \Q\E or \E we
3555 ph10 205 skip them too. This makes for compatibility with Perl. */
3556 ph10 208
3557 ph10 205 negate_class = FALSE;
3558     for (;;)
3559 nigel 77 {
3560     c = *(++ptr);
3561 ph10 391 if (c == CHAR_BACKSLASH)
3562 ph10 205 {
3563 ph10 392 if (ptr[1] == CHAR_E)
3564 ph10 391 ptr++;
3565 ph10 392 else if (strncmp((const char *)ptr+1,
3566     STR_Q STR_BACKSLASH STR_E, 3) == 0)
3567 ph10 391 ptr += 3;
3568 ph10 392 else
3569 ph10 391 break;
3570 ph10 205 }
3571 ph10 391 else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
3572 ph10 205 negate_class = TRUE;
3573     else break;
3574 ph10 208 }
3575 ph10 345
3576     /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
3577     an initial ']' is taken as a data character -- the code below handles
3578 ph10 341 that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
3579     [^] must match any character, so generate OP_ALLANY. */
3580 ph10 345
3581 ph10 392 if (c == CHAR_RIGHT_SQUARE_BRACKET &&
3582 ph10 391 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
3583 ph10 341 {
3584     *code++ = negate_class? OP_ALLANY : OP_FAIL;
3585     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3586     zerofirstbyte = firstbyte;
3587     break;
3588 ph10 345 }
3589 nigel 77
3590 ph10 286 /* If a class contains a negative special such as \S, we need to flip the
3591     negation flag at the end, so that support for characters > 255 works
3592 ph10 264 correctly (they are all included in the class). */
3593    
3594     should_flip_negation = FALSE;
3595    
3596 nigel 77 /* Keep a count of chars with values < 256 so that we can optimize the case
3597 nigel 93 of just a single character (as long as it's < 256). However, for higher
3598     valued UTF-8 characters, we don't yet do any optimization. */
3599 nigel 77
3600     class_charcount = 0;
3601     class_lastchar = -1;
3602    
3603 nigel 93 /* Initialize the 32-char bit map to all zeros. We build the map in a
3604     temporary bit of memory, in case the class contains only 1 character (less
3605     than 256), because in that case the compiled code doesn't use the bit map.
3606     */
3607    
3608     memset(classbits, 0, 32 * sizeof(uschar));
3609    
3610 nigel 77 #ifdef SUPPORT_UTF8
3611     class_utf8 = FALSE; /* No chars >= 256 */
3612 nigel 93 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
3613 ph10 309 class_utf8data_base = class_utf8data; /* For resetting in pass 1 */
3614 nigel 77 #endif
3615    
3616     /* Process characters until ] is reached. By writing this as a "do" it
3617 nigel 93 means that an initial ] is taken as a data character. At the start of the
3618     loop, c contains the first byte of the character. */
3619 nigel 77
3620 nigel 93 if (c != 0) do
3621 nigel 77 {
3622 nigel 93 const uschar *oldptr;
3623    
3624 nigel 77 #ifdef SUPPORT_UTF8
3625     if (utf8 && c > 127)
3626     { /* Braces are required because the */
3627     GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
3628     }
3629 ph10 535
3630 ph10 300 /* In the pre-compile phase, accumulate the length of any UTF-8 extra
3631 ph10 309 data and reset the pointer. This is so that very large classes that
3632 ph10 300 contain a zillion UTF-8 characters no longer overwrite the work space
3633 ph10 309 (which is on the stack). */
3634    
3635 ph10 300 if (lengthptr != NULL)
3636     {
3637     *lengthptr += class_utf8data - class_utf8data_base;
3638 ph10 309 class_utf8data = class_utf8data_base;
3639     }
3640    
3641 nigel 77 #endif
3642    
3643     /* Inside \Q...\E everything is literal except \E */
3644    
3645     if (inescq)
3646     {
3647 ph10 391 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E) /* If we are at \E */
3648 nigel 77 {
3649 nigel 93 inescq = FALSE; /* Reset literal state */
3650     ptr++; /* Skip the 'E' */
3651     continue; /* Carry on with next */
3652 nigel 77 }
3653 nigel 93 goto CHECK_RANGE; /* Could be range if \E follows */
3654 nigel 77 }
3655    
3656     /* Handle POSIX class names. Perl allows a negation extension of the
3657     form [:^name:]. A square bracket that doesn't match the syntax is
3658     treated as a literal. We also recognize the POSIX constructions
3659     [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
3660     5.6 and 5.8 do. */
3661    
3662 ph10 391 if (c == CHAR_LEFT_SQUARE_BRACKET &&
3663 ph10 392 (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3664 ph10 391 ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
3665 nigel 77 {
3666     BOOL local_negate = FALSE;
3667 nigel 87 int posix_class, taboffset, tabopt;
3668 nigel 77 register const uschar *cbits = cd->cbits;
3669 nigel 87 uschar pbits[32];
3670 nigel 77
3671 ph10 391 if (ptr[1] != CHAR_COLON)
3672 nigel 77 {
3673     *errorcodeptr = ERR31;
3674     goto FAILED;
3675     }
3676    
3677     ptr += 2;
3678 ph10 391 if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
3679 nigel 77 {
3680     local_negate = TRUE;
3681 ph10 286 should_flip_negation = TRUE; /* Note negative special */
3682 nigel 77 ptr++;
3683     }
3684    
3685 ph10 530 posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
3686 nigel 77 if (posix_class < 0)
3687     {
3688     *errorcodeptr = ERR30;
3689     goto FAILED;
3690     }
3691    
3692     /* If matching is caseless, upper and lower are converted to
3693     alpha. This relies on the fact that the class table starts with
3694     alpha, lower, upper as the first 3 entries. */
3695    
3696     if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
3697     posix_class = 0;
3698 ph10 535
3699     /* When PCRE_UCP is set, some of the POSIX classes are converted to
3700 ph10 518 different escape sequences that use Unicode properties. */
3701 ph10 535
3702 ph10 518 #ifdef SUPPORT_UCP
3703     if ((options & PCRE_UCP) != 0)
3704     {
3705     int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);
3706     if (posix_substitutes[pc] != NULL)
3707     {
3708 ph10 535 nestptr = tempptr + 1;
3709 ph10 518 ptr = posix_substitutes[pc] - 1;
3710 ph10 535 continue;
3711     }
3712     }
3713     #endif
3714 ph10 518 /* In the non-UCP case, we build the bit map for the POSIX class in a
3715     chunk of local store because we may be adding and subtracting from it,
3716     and we don't want to subtract bits that may be in the main map already.
3717     At the end we or the result into the bit map that is being built. */
3718 nigel 77
3719     posix_class *= 3;
3720 nigel 87
3721     /* Copy in the first table (always present) */
3722    
3723     memcpy(pbits, cbits + posix_class_maps[posix_class],
3724     32 * sizeof(uschar));
3725    
3726     /* If there is a second table, add or remove it as required. */
3727    
3728     taboffset = posix_class_maps[posix_class + 1];
3729     tabopt = posix_class_maps[posix_class + 2];
3730    
3731     if (taboffset >= 0)
3732 nigel 77 {
3733 nigel 87 if (tabopt >= 0)
3734     for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
3735 nigel 77 else
3736 nigel 87 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
3737 nigel 77 }
3738    
3739 nigel 87 /* Now see if we need to remove any special characters. An option
3740     value of 1 removes vertical space and 2 removes underscore. */
3741    
3742     if (tabopt < 0) tabopt = -tabopt;
3743     if (tabopt == 1) pbits[1] &= ~0x3c;
3744     else if (tabopt == 2) pbits[11] &= 0x7f;
3745    
3746     /* Add the POSIX table or its complement into the main table that is
3747     being built and we are done. */
3748    
3749     if (local_negate)
3750     for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
3751     else
3752     for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
3753    
3754 nigel 77 ptr = tempptr + 1;
3755     class_charcount = 10; /* Set > 1; assumes more than 1 per class */
3756     continue; /* End of POSIX syntax handling */
3757     }
3758    
3759     /* Backslash may introduce a single character, or it may introduce one
3760 nigel 93 of the specials, which just set a flag. The sequence \b is a special
3761 ph10 513 case. Inside a class (and only there) it is treated as backspace. We
3762     assume that other escapes have more than one character in them, so set
3763     class_charcount bigger than one. Unrecognized escapes fall through and
3764     are either treated as literal characters (by default), or are faulted if
3765     PCRE_EXTRA is set. */
3766 nigel 77
3767 ph10 391 if (c == CHAR_BACKSLASH)
3768 nigel 77 {
3769 nigel 93 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3770     if (*errorcodeptr != 0) goto FAILED;
3771 nigel 77
3772 ph10 513 if (-c == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
3773 nigel 77 else if (-c == ESC_Q) /* Handle start of quoted string */
3774     {
3775 ph10 391 if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3776 nigel 77 {
3777     ptr += 2; /* avoid empty string */
3778     }
3779     else inescq = TRUE;
3780     continue;
3781     }
3782 ph10 220 else if (-c == ESC_E) continue; /* Ignore orphan \E */
3783 nigel 77
3784     if (c < 0)
3785     {
3786     register const uschar *cbits = cd->cbits;
3787     class_charcount += 2; /* Greater than 1 is what matters */
3788 nigel 93
3789 ph10 518 switch (-c)
3790 nigel 77 {
3791 ph10 518 #ifdef SUPPORT_UCP
3792     case ESC_du: /* These are the values given for \d etc */
3793     case ESC_DU: /* when PCRE_UCP is set. We replace the */
3794     case ESC_wu: /* escape sequence with an appropriate \p */
3795     case ESC_WU: /* or \P to test Unicode properties instead */
3796     case ESC_su: /* of the default ASCII testing. */
3797     case ESC_SU:
3798     nestptr = ptr;
3799     ptr = substitutes[-c - ESC_DU] - 1; /* Just before substitute */
3800 ph10 535 class_charcount -= 2; /* Undo! */
3801 ph10 518 continue;
3802     #endif
3803 nigel 77 case ESC_d:
3804     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
3805     continue;
3806    
3807     case ESC_D:
3808 ph10 286 should_flip_negation = TRUE;
3809 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
3810     continue;
3811    
3812     case ESC_w:
3813     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
3814     continue;
3815    
3816     case ESC_W:
3817 ph10 286 should_flip_negation = TRUE;
3818 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
3819     continue;
3820    
3821 ph10 552 /* Perl 5.004 onwards omits VT from \s, but we must preserve it
3822 ph10 579 if it was previously set by something earlier in the character
3823     class. */
3824 ph10 552
3825 nigel 77 case ESC_s:
3826 ph10 552 classbits[0] |= cbits[cbit_space];
3827 ph10 579 classbits[1] |= cbits[cbit_space+1] & ~0x08;
3828 ph10 552 for (c = 2; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
3829 nigel 77 continue;
3830    
3831     case ESC_S:
3832 ph10 286 should_flip_negation = TRUE;
3833 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
3834     classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
3835     continue;
3836    
3837 ph10 518 case ESC_h:
3838 ph10 178 SETBIT(classbits, 0x09); /* HT */
3839     SETBIT(classbits, 0x20); /* SPACE */
3840 ph10 180 SETBIT(classbits, 0xa0); /* NBSP */
3841 ph10 178 #ifdef SUPPORT_UTF8
3842     if (utf8)
3843 ph10 180 {
3844 ph10 178 class_utf8 = TRUE;
3845     *class_utf8data++ = XCL_SINGLE;
3846 ph10 180 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
3847 ph10 178 *class_utf8data++ = XCL_SINGLE;
3848 ph10 180 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
3849     *class_utf8data++ = XCL_RANGE;
3850     class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
3851     class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
3852 ph10 178 *class_utf8data++ = XCL_SINGLE;
3853 ph10 180 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
3854 ph10 178 *class_utf8data++ = XCL_SINGLE;
3855 ph10 180 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
3856