/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 642 - (hide annotations) (download)
Thu Jul 28 18:59:40 2011 UTC (3 years, 3 months ago) by ph10
File MIME type: text/plain
File size: 243683 byte(s)
Avoid false positive for infinite recursion by not checking conditionals at 
compile time, but add tests at runtime that also catch infinite mutual 
recursion.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 598 Copyright (c) 1997-2011 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains the external function pcre_compile(), along with
42     supporting internal functions that are not used by other modules. */
43    
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 236 #include "config.h"
47 ph10 200 #endif
48 ph10 199
49 nigel 93 #define NLBLOCK cd /* Block containing newline information */
50     #define PSSTART start_pattern /* Field containing processed string start */
51     #define PSEND end_pattern /* Field containing processed string end */
52    
53 nigel 77 #include "pcre_internal.h"
54    
55    
56 ph10 475 /* When PCRE_DEBUG is defined, we need the pcre_printint() function, which is
57     also used by pcretest. PCRE_DEBUG is not defined when building a production
58     library. */
59 nigel 85
60 ph10 475 #ifdef PCRE_DEBUG
61 nigel 85 #include "pcre_printint.src"
62     #endif
63    
64    
65 ph10 178 /* Macro for setting individual bits in class bitmaps. */
66    
/* Sets bit number b in the byte-array bitmap a: byte index is b/8 and the bit
within that byte is b%8. Both arguments and the whole expansion are
parenthesized so that expression arguments (e.g. SETBIT(map, c + 1) or
SETBIT(base + off, c)) select the intended byte and bit. Note that b is
evaluated twice, so it must not have side effects. */
#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
68    
69 ph10 202 /* Maximum length value to check against when making sure that the integer that
70     holds the compiled pattern length does not overflow. We make it a bit less than
71     INT_MAX to allow for adding in group terminating bytes, so that we don't have
72     to check them every time. */
73 ph10 178
74 ph10 202 #define OFLOW_MAX (INT_MAX - 20)
75    
76    
77 nigel 77 /*************************************************
78     * Code parameters and static tables *
79     *************************************************/
80    
81 nigel 93 /* This value specifies the size of stack workspace that is used during the
82     first pre-compile phase that determines how much memory is required. The regex
83     is partly compiled into this space, but the compiled parts are discarded as
84     soon as they can be, so that hopefully there will never be an overrun. The code
85     does, however, check for an overrun. The largest amount I've seen used is 218,
86     so this number is very generous.
87 nigel 77
88 nigel 93 The same workspace is used during the second, actual compile phase for
89     remembering forward references to groups so that they can be filled in at the
90     end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
91     is 4 there is plenty of room. */
92 nigel 77
93 nigel 93 #define COMPILE_WORK_SIZE (4096)
94 nigel 77
95 ph10 507 /* The overrun tests check for a slightly smaller size so that they detect the
96 ph10 505 overrun before it actually does run off the end of the data block. */
97 nigel 93
98 ph10 505 #define WORK_SIZE_CHECK (COMPILE_WORK_SIZE - 100)
99    
100    
101 nigel 77 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
102     are simple data values; negative values are for special things like \d and so
103     on. Zero means further processing is needed (for things like \x), or the escape
104     is invalid. */
105    
106 ph10 391 #ifndef EBCDIC
107    
108     /* This is the "normal" table for ASCII systems or for EBCDIC systems running
109 ph10 392 in UTF-8 mode. */
110 ph10 391
/* check_escape() looks a character up as escapes[c - CHAR_0], after checking
that c lies in the range CHAR_0 to CHAR_z. The 75 entries therefore run from
'0' through 'z', two per row; the ten leading zeros are the digit characters. */
111 ph10 392 static const short int escapes[] = {
112 ph10 391 0, 0,
113     0, 0,
114 ph10 392 0, 0,
115     0, 0,
116     0, 0,
117 ph10 391 CHAR_COLON, CHAR_SEMICOLON,
118 ph10 392 CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN,
119 ph10 391 CHAR_GREATER_THAN_SIGN, CHAR_QUESTION_MARK,
120 ph10 392 CHAR_COMMERCIAL_AT, -ESC_A,
121     -ESC_B, -ESC_C,
122     -ESC_D, -ESC_E,
123     0, -ESC_G,
124     -ESC_H, 0,
125     0, -ESC_K,
126 ph10 391 0, 0,
127 ph10 514 -ESC_N, 0,
128 ph10 391 -ESC_P, -ESC_Q,
129     -ESC_R, -ESC_S,
130 ph10 392 0, 0,
131     -ESC_V, -ESC_W,
132     -ESC_X, 0,
133     -ESC_Z, CHAR_LEFT_SQUARE_BRACKET,
134 ph10 391 CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET,
135 ph10 392 CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE,
136 ph10 391 CHAR_GRAVE_ACCENT, 7,
137 ph10 392 -ESC_b, 0,
138     -ESC_d, ESC_e,
139 ph10 391 ESC_f, 0,
140     -ESC_h, 0,
141 ph10 392 0, -ESC_k,
142 ph10 391 0, 0,
143     ESC_n, 0,
144 ph10 392 -ESC_p, 0,
145     ESC_r, -ESC_s,
146 ph10 391 ESC_tee, 0,
147 ph10 392 -ESC_v, -ESC_w,
148     0, 0,
149 ph10 391 -ESC_z
150 nigel 77 };
151    
152 ph10 392 #else
153 ph10 391
154     /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
155    
/* In the EBCDIC build check_escape() indexes this table as escapes[c - 0x48].
The hexadecimal value in the comment at the start of each row is the character
code of that row's first entry; each row holds eight entries. */
156 nigel 77 static const short int escapes[] = {
157     /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
158     /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
159     /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
160     /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
161     /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
162     /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
163     /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
164     /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
165 ph10 178 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
166 nigel 93 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
167 nigel 77 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
168 ph10 178 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
169 nigel 77 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
170     /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
171     /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
172     /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
173 ph10 178 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
174 ph10 514 /* D0 */ '}', 0, -ESC_K, 0, 0,-ESC_N, 0, -ESC_P,
175 nigel 93 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
176 ph10 178 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
177 nigel 77 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
178     /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
179     /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
180     };
181     #endif
182    
183    
184 ph10 243 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
185     searched linearly. Put all the names into a single string, in order to reduce
186 ph10 392 the number of relocations when a shared library is dynamically linked. The
187     string is built from string macros so that it works in UTF-8 mode on EBCDIC
188 ph10 391 platforms. */
189 ph10 210
/* Descriptor for one (*VERB) name. Field order matters: the verbs[] table is
initialized positionally. */
typedef struct verbitem {
  int len;      /* Number of characters in the verb's name */
  int op;       /* Opcode to use when the verb has no argument; -1 means an
                   argument is mandatory */
  int op_arg;   /* Opcode to use when an argument is present; -1 means an
                   argument is not allowed */
} verbitem;
195 ph10 210
/* All verb names concatenated into one string. The sequence here lines up
positionally with the entries of verbs[]: the len field of each verbs[] entry
equals the length of the name in the same position. The STRING_xxx0 macros are
defined outside this file - presumably each is the name followed by "\0", with
the last name relying on the string literal's own terminator; TODO confirm. */
196 ph10 240 static const char verbnames[] =
197 ph10 510 "\0" /* Empty name is a shorthand for MARK */
198 ph10 512 STRING_MARK0
199 ph10 391 STRING_ACCEPT0
200     STRING_COMMIT0
201     STRING_F0
202     STRING_FAIL0
203     STRING_PRUNE0
204     STRING_SKIP0
205     STRING_THEN;
206 ph10 240
/* One entry per name in verbnames, in the same order. Each entry is
{ name length, op when no argument (or -1), op when argument present (or -1) }. */
207 ph10 327 static const verbitem verbs[] = {
208 ph10 510 { 0, -1, OP_MARK }, /* empty name: argument required */
209 ph10 512 { 4, -1, OP_MARK }, /* MARK: argument required */
210 ph10 510 { 6, OP_ACCEPT, -1 }, /* ACCEPT: no argument allowed */
211     { 6, OP_COMMIT, -1 }, /* COMMIT: no argument allowed */
212     { 1, OP_FAIL, -1 }, /* F: same op as FAIL */
213     { 4, OP_FAIL, -1 }, /* FAIL: no argument allowed */
214     { 5, OP_PRUNE, OP_PRUNE_ARG }, /* PRUNE: argument optional */
215     { 4, OP_SKIP, OP_SKIP_ARG }, /* SKIP: argument optional */
216     { 4, OP_THEN, OP_THEN_ARG } /* THEN: argument optional */
217 ph10 210 };
218    
/* Number of entries in verbs[], computed from the table itself. */
219 ph10 327 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
220 ph10 210
221    
222 ph10 243 /* Tables of names of POSIX character classes and their lengths. The names are
223     now all in a single string, to reduce the number of relocations when a shared
224 ph10 240 library is dynamically loaded. The list of lengths is terminated by a zero
225     length entry. The first three must be alpha, lower, upper, as this is assumed
226     for handling case independence. */
227 nigel 77
/* POSIX class names concatenated into one string, in the order alpha, lower,
upper, alnum, ascii, blank, cntrl, digit, graph, print, punct, space, word,
xdigit. The same order is used by posix_name_lengths and by the triples in
posix_class_maps. */
228 ph10 240 static const char posix_names[] =
229 ph10 392 STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
230     STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
231 ph10 391 STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
232     STRING_word0 STRING_xdigit;
233 nigel 77
/* Length of each name above, in the same order: twelve names of 5 characters,
then "word" (4) and "xdigit" (6). The final 0 terminates the list. */
234     static const uschar posix_name_lengths[] = {
235     5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
236    
237 nigel 87 /* Table of class bit maps for each POSIX class. Each class is formed from a
238     base map, with an optional addition or removal of another map. Then, for some
239     classes, there is some additional tweaking: for [:blank:] the vertical space
240     characters are removed, and for [:alpha:] and [:alnum:] the underscore
241     character is removed. The triples in the table consist of the base map offset,
242     second map offset or -1 if no second map, and a non-negative value for map
243     addition or a negative value for map subtraction (if there are two maps). The
244     absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
245     remove vertical space characters, 2 => remove underscore. */
246 nigel 77
/* Three ints per class, in posix_names order:
{ base map offset, second map offset or -1, tweak/sign value }. */
247     static const int posix_class_maps[] = {
248 nigel 87 cbit_word, cbit_digit, -2, /* alpha - word minus digit, without underscore */
249     cbit_lower, -1, 0, /* lower */
250     cbit_upper, -1, 0, /* upper */
251     cbit_word, -1, 2, /* alnum - word without underscore */
252     cbit_print, cbit_cntrl, 0, /* ascii - print plus cntrl */
253     cbit_space, -1, 1, /* blank - a GNU extension; space without vertical space */
254     cbit_cntrl, -1, 0, /* cntrl */
255     cbit_digit, -1, 0, /* digit */
256     cbit_graph, -1, 0, /* graph */
257     cbit_print, -1, 0, /* print */
258     cbit_punct, -1, 0, /* punct */
259     cbit_space, -1, 0, /* space */
260     cbit_word, -1, 0, /* word - a Perl extension */
261     cbit_xdigit,-1, 0 /* xdigit */
262 nigel 77 };
263    
264 ph10 535 /* Table of substitutes for \d etc when PCRE_UCP is set. The POSIX class
265     substitutes must be in the order of the names, defined above, and there are
266 ph10 518 both positive and negative cases. NULL means no substitute. */
267 nigel 77
268 ph10 518 #ifdef SUPPORT_UCP
/* Replacement pattern text for \D \d \S \s \W \w, in that order. Each string
is itself pattern syntax (a \p or \P property escape). NOTE(review): the index
used to select an entry is computed outside the visible part of this file -
confirm at the use site. */
269     static const uschar *substitutes[] = {
270     (uschar *)"\\P{Nd}", /* \D */
271     (uschar *)"\\p{Nd}", /* \d */
272     (uschar *)"\\P{Xsp}", /* \S */ /* NOTE: Xsp is Perl space */
273     (uschar *)"\\p{Xsp}", /* \s */
274     (uschar *)"\\P{Xwd}", /* \W */
275 ph10 535 (uschar *)"\\p{Xwd}" /* \w */
276 ph10 518 };
277 ph10 535
/* Replacement pattern text for POSIX classes. The first 14 entries are the
positive classes in posix_names order; the next 14 are the negated classes in
the same order. A NULL entry means the class has no substitute. */
278 ph10 518 static const uschar *posix_substitutes[] = {
279     (uschar *)"\\p{L}", /* alpha */
280 ph10 535 (uschar *)"\\p{Ll}", /* lower */
281     (uschar *)"\\p{Lu}", /* upper */
282     (uschar *)"\\p{Xan}", /* alnum */
283 ph10 518 NULL, /* ascii */
284     (uschar *)"\\h", /* blank */
285     NULL, /* cntrl */
286     (uschar *)"\\p{Nd}", /* digit */
287     NULL, /* graph */
288     NULL, /* print */
289     NULL, /* punct */
290     (uschar *)"\\p{Xps}", /* space */ /* NOTE: Xps is POSIX space */
291     (uschar *)"\\p{Xwd}", /* word */
292 ph10 535 NULL, /* xdigit */
293 ph10 518 /* Negated cases */
294     (uschar *)"\\P{L}", /* ^alpha */
295 ph10 535 (uschar *)"\\P{Ll}", /* ^lower */
296     (uschar *)"\\P{Lu}", /* ^upper */
297     (uschar *)"\\P{Xan}", /* ^alnum */
298 ph10 518 NULL, /* ^ascii */
299     (uschar *)"\\H", /* ^blank */
300     NULL, /* ^cntrl */
301     (uschar *)"\\P{Nd}", /* ^digit */
302     NULL, /* ^graph */
303     NULL, /* ^print */
304     NULL, /* ^punct */
305     (uschar *)"\\P{Xps}", /* ^space */ /* NOTE: Xps is POSIX space */
306     (uschar *)"\\P{Xwd}", /* ^word */
307 ph10 535 NULL /* ^xdigit */
308 ph10 518 };
/* Total number of entries in posix_substitutes (positive plus negated). */
309     #define POSIX_SUBSIZE (sizeof(posix_substitutes)/sizeof(uschar *))
310 ph10 535 #endif
311 ph10 518
/* Two-level stringification. XSTRING(s) passes s through one extra macro call
so that s is macro-expanded first; STRING then turns the expanded text into a
string literal. error_texts uses XSTRING to embed the values of MAX_NAME_SIZE
and MAX_NAME_COUNT in its messages. */
312 nigel 93 #define STRING(a) # a
313     #define XSTRING(s) STRING(s)
314    
315 nigel 77 /* The texts of compile-time error messages. These are "char *" because they
316 nigel 93 are passed to the outside world. Do not ever re-use any error number, because
317     they are documented. Always add a new error instead. Messages marked DEAD below
318 ph10 243 are no longer used. This used to be a table of strings, but in order to reduce
319     the number of relocations needed when a shared library is loaded dynamically,
320     it is now one long string. We cannot use a table of offsets, because the
321     lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
322     simply count through to the one we want - this isn't a performance issue
323 ph10 507 because these strings are used only when there is a compilation error.
324 nigel 77
325 ph10 507 Each substring ends with \0 to insert a null character. This includes the final
326     substring, so that the whole string ends with \0\0, which can be detected when
327 ph10 499 counting through. */
328    
/* Message n is the (n+1)th NUL-terminated substring; find_error_text() walks
the string to reach it. The numeric comments give the error number of the
message that follows them. Only append new messages at the end, so that
existing numbers never change. */
329 ph10 240 static const char error_texts[] =
330     "no error\0"
331     "\\ at end of pattern\0"
332     "\\c at end of pattern\0"
333     "unrecognized character follows \\\0"
334     "numbers out of order in {} quantifier\0"
335 nigel 77 /* 5 */
336 ph10 240 "number too big in {} quantifier\0"
337     "missing terminating ] for character class\0"
338     "invalid escape sequence in character class\0"
339     "range out of order in character class\0"
340     "nothing to repeat\0"
341 nigel 77 /* 10 */
342 ph10 240 "operand of unlimited repeat could match the empty string\0" /** DEAD **/
343     "internal error: unexpected repeat\0"
344 ph10 269 "unrecognized character after (? or (?-\0"
345 ph10 240 "POSIX named classes are supported only within a class\0"
346     "missing )\0"
347 nigel 77 /* 15 */
348 ph10 240 "reference to non-existent subpattern\0"
349     "erroffset passed as NULL\0"
350     "unknown option bit(s) set\0"
351     "missing ) after comment\0"
352     "parentheses nested too deeply\0" /** DEAD **/
353 nigel 77 /* 20 */
354 ph10 240 "regular expression is too large\0"
355     "failed to get memory\0"
356     "unmatched parentheses\0"
357     "internal error: code overflow\0"
358     "unrecognized character after (?<\0"
359 nigel 77 /* 25 */
360 ph10 240 "lookbehind assertion is not fixed length\0"
361     "malformed number or name after (?(\0"
362     "conditional group contains more than two branches\0"
363     "assertion expected after (?(\0"
364     "(?R or (?[+-]digits must be followed by )\0"
365 nigel 77 /* 30 */
366 ph10 240 "unknown POSIX class name\0"
367     "POSIX collating elements are not supported\0"
368     "this version of PCRE is not compiled with PCRE_UTF8 support\0"
369     "spare error\0" /** DEAD **/
370     "character value in \\x{...} sequence is too large\0"
371 nigel 77 /* 35 */
372 ph10 240 "invalid condition (?(0)\0"
373     "\\C not allowed in lookbehind assertion\0"
374 ph10 514 "PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0"
375 ph10 240 "number after (?C is > 255\0"
376     "closing ) for (?C expected\0"
377 nigel 77 /* 40 */
378 ph10 240 "recursive call could loop indefinitely\0"
379     "unrecognized character after (?P\0"
380     "syntax error in subpattern name (missing terminator)\0"
381     "two named subpatterns have the same name\0"
382     "invalid UTF-8 string\0"
383 nigel 77 /* 45 */
384 ph10 240 "support for \\P, \\p, and \\X has not been compiled\0"
385     "malformed \\P or \\p sequence\0"
386     "unknown property name after \\P or \\p\0"
387     "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
388     "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
389 nigel 91 /* 50 */
390 ph10 240 "repeated subpattern is too long\0" /** DEAD **/
391     "octal value is greater than \\377 (not in UTF-8 mode)\0"
392     "internal error: overran compiling workspace\0"
393     "internal error: previously-checked referenced subpattern not found\0"
394     "DEFINE group contains more than one branch\0"
395 nigel 93 /* 55 */
396 ph10 637 "repeating a DEFINE group is not allowed\0" /** DEAD **/
397 ph10 240 "inconsistent NEWLINE options\0"
398 ph10 333 "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
399     "a numbered reference must not be zero\0"
400 ph10 510 "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0"
401 ph10 211 /* 60 */
402 ph10 240 "(*VERB) not recognized\0"
403 ph10 268 "number is too big\0"
404 ph10 272 "subpattern name expected\0"
405 ph10 336 "digit expected after (?+\0"
406 ph10 457 "] is an invalid data character in JavaScript compatibility mode\0"
407     /* 65 */
408 ph10 510 "different names for subpatterns of the same number are not allowed\0"
409 ph10 512 "(*MARK) must have an argument\0"
410 ph10 535 "this version of PCRE is not compiled with PCRE_UCP support\0"
411 ph10 579 "\\c must be followed by an ASCII character\0"
412 ph10 629 "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
413 ph10 510 ;
414 nigel 77
415     /* Table to identify digits and hex digits. This is used when compiling
416     patterns. Note that the tables in chartables are dependent on the locale, and
417     may mark arbitrary characters as digits - but the PCRE compiling code expects
418     to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
419     a private table here. It costs 256 bytes, but it is a lot faster than doing
420     character value tests (at least in some simple cases I timed), and in some
421     applications one wants PCRE to compile efficiently as well as match
422     efficiently.
423    
424     For convenience, we use the same bit definitions as in chartables:
425    
426     0x04 decimal digit
427     0x08 hexadecimal digit
428    
429     Then we can use ctype_digit and ctype_xdigit in the code. */
430    
431 ph10 392 #ifndef EBCDIC
432 ph10 391
433 ph10 392 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
434 ph10 391 UTF-8 mode. */
435    
/* 256 entries indexed by character code, eight per row. 0x0c marks '0'-'9'
(decimal digit 0x04 plus hexadecimal digit 0x08); 0x08 marks 'A'-'F' and
'a'-'f'; every other entry is 0x00. */
436 nigel 77 static const unsigned char digitab[] =
437     {
438     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
439     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
440     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
441     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
442     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
443     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
444     0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
445     0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
446     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
447     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
448     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
449     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
450     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
451     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
452     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
453     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
454     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
455     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
456     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
457     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
458     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
459     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
460     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
461     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
462     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
463     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
464     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
465     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
466     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
467     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
468     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
469     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
470    
471 ph10 392 #else
472 ph10 391
473     /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
474    
/* EBCDIC layout of the digit table: 256 entries indexed by character code,
eight per row. 0x0c (decimal plus hexadecimal digit) is at 0xF0-0xF9; 0x08
(hexadecimal digit only) is at 0x81-0x86 and 0xC1-0xC6. The trailing hex value
in some row comments is the code of that row's first entry. */
475 nigel 77 static const unsigned char digitab[] =
476     {
477     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
478     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
479     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
480     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
481     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
482     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
483     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
484     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
485     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
486     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
487     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
488 ph10 97 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
489 nigel 77 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
490     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
491     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
492     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
493     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
494     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
495     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
496     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
497     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
498     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
499     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
500     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
501     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
502     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
503     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
504     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
505     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
506     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
507     0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
508     0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
509    
/* Character-type flags for EBCDIC, 256 entries indexed by character code,
eight per row. check_escape() tests (ebcdic_chartab[c] & 0x0E) to decide
whether c is alphanumeric. Bits 0x04 and 0x08 are the decimal-digit and
hexadecimal-digit flags described for digitab. NOTE(review): the meanings of
the other bits (0x01, 0x02, 0x10, 0x80) are not established in this file -
presumably they follow the chartables ctype definitions; confirm. */
510     static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
511     0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
512     0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
513     0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
514     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
515     0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
516     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
517     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
518     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
519     0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
520     0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
521     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
522 ph10 97 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
523 nigel 77 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
524     0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
525     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
526     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
527     0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
528     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
529     0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
530     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
531     0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
532     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
533     0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
534     0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
535     0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
536     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
537     0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
538     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
539     0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
540     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
541     0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
542     0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
543     #endif
544    
545    
546     /* Definition to allow mutual recursion */
547    
/* Forward declaration only: the definition is not in the visible part of this
file. Parameter names are omitted here, so consult the definition for the
meaning of each of the thirteen arguments. */
548     static BOOL
549 ph10 642 compile_regex(int, uschar **, const uschar **, int *, BOOL, BOOL, int, int,
550     int *, int *, branch_chain *, compile_data *, int *);
551 nigel 77
552    
553    
554     /*************************************************
555 ph10 240 * Find an error text *
556     *************************************************/
557    
558 ph10 243 /* The error texts are now all in one long string, to save on relocations. As
559     some of the text is of unknown length, we can't use a table of offsets.
560     Instead, just count through the strings. This is not a performance issue
561 ph10 240 because it happens only when there has been a compilation error.
562    
563     Argument: the error number
564     Returns: pointer to the error string
565     */
566    
567     static const char *
568     find_error_text(int n)
569     {
570     const char *s = error_texts;
571 ph10 507 for (; n > 0; n--)
572 ph10 499 {
573     while (*s++ != 0) {};
574     if (*s == 0) return "Error text not found (please report)";
575 ph10 507 }
576 ph10 240 return s;
577     }
578    
579    
580     /*************************************************
581 ph10 640 * Check for counted repeat *
582     *************************************************/
583    
584     /* This function is called when a '{' is encountered in a place where it might
585     start a quantifier. It looks ahead to see if it really is a quantifier or not.
586     It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
587     where the ddds are digits.
588    
589     Arguments:
590     p pointer to the first char after '{'
591    
592     Returns: TRUE or FALSE
593     */
594    
595     static BOOL
596     is_counted_repeat(const uschar *p)
597     {
598     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
599     while ((digitab[*p] & ctype_digit) != 0) p++;
600     if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
601    
602     if (*p++ != CHAR_COMMA) return FALSE;
603     if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
604    
605     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
606     while ((digitab[*p] & ctype_digit) != 0) p++;
607    
608     return (*p == CHAR_RIGHT_CURLY_BRACKET);
609     }
610    
611    
612    
613     /*************************************************
614 nigel 77 * Handle escapes *
615     *************************************************/
616    
617     /* This function is called when a \ has been encountered. It either returns a
618     positive value for a simple escape such as \n, or a negative value which
619 nigel 93 encodes one of the more complicated things such as \d. A backreference to group
620     n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
621     UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
622     ptr is pointing at the \. On exit, it is on the final character of the escape
623     sequence.
624 nigel 77
625     Arguments:
626     ptrptr points to the pattern position pointer
627     errorcodeptr points to the errorcode variable
628     bracount number of previous extracting brackets
629     options the options bits
630     isclass TRUE if inside a character class
631    
632     Returns: zero or positive => a data character
633     negative => a special escape sequence
634 ph10 213 on error, errorcodeptr is set
635 nigel 77 */
636    
637     static int
638     check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
639     int options, BOOL isclass)
640     {
641 nigel 87 BOOL utf8 = (options & PCRE_UTF8) != 0;
642     const uschar *ptr = *ptrptr + 1;
643 nigel 77 int c, i;
644    
645 nigel 87 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
646     ptr--; /* Set pointer back to the last byte */
647    
648 nigel 77 /* If backslash is at the end of the pattern, it's an error. */
649    
650     if (c == 0) *errorcodeptr = ERR1;
651    
652 ph10 274 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
653     in a table. A non-zero result is something that can be returned immediately.
654 nigel 77 Otherwise further processing may be required. */
655    
656 ph10 391 #ifndef EBCDIC /* ASCII/UTF-8 coding */
657     else if (c < CHAR_0 || c > CHAR_z) {} /* Not alphanumeric */
658     else if ((i = escapes[c - CHAR_0]) != 0) c = i;
659 nigel 77
660 ph10 97 #else /* EBCDIC coding */
661 ph10 274 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphanumeric */
662 nigel 77 else if ((i = escapes[c - 0x48]) != 0) c = i;
663     #endif
664    
665     /* Escapes that need further processing, or are illegal. */
666    
667     else
668     {
669     const uschar *oldptr;
670 nigel 93 BOOL braced, negated;
671    
672 nigel 77 switch (c)
673     {
674     /* A number of Perl escapes are not handled by PCRE. We give an explicit
675     error. */
676    
677 ph10 391 case CHAR_l:
678     case CHAR_L:
679     case CHAR_u:
680     case CHAR_U:
681 nigel 77 *errorcodeptr = ERR37;
682     break;
683    
684 ph10 640 /* In a character class, \g is just a literal "g". Outside a character
685     class, \g must be followed by one of a number of specific things:
686 ph10 345
687 ph10 333 (1) A number, either plain or braced. If positive, it is an absolute
688     backreference. If negative, it is a relative backreference. This is a Perl
689     5.10 feature.
690 ph10 345
691 ph10 333 (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
692     is part of Perl's movement towards a unified syntax for back references. As
693     this is synonymous with \k{name}, we fudge it up by pretending it really
694     was \k.
695 ph10 345
696     (3) For Oniguruma compatibility we also support \g followed by a name or a
697     number either in angle brackets or in single quotes. However, these are
698     (possibly recursive) subroutine calls, _not_ backreferences. Just return
699 ph10 333 the -ESC_g code (cf \k). */
700 nigel 93
701 ph10 391 case CHAR_g:
702 ph10 640 if (isclass) break;
703 ph10 391 if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
704 ph10 333 {
705     c = -ESC_g;
706 ph10 345 break;
707     }
708 ph10 333
709     /* Handle the Perl-compatible cases */
710 ph10 345
711 ph10 391 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
712 nigel 93 {
713 ph10 171 const uschar *p;
714 ph10 391 for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
715     if (*p != CHAR_MINUS && (digitab[*p] & ctype_digit) == 0) break;
716     if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)
717 ph10 171 {
718     c = -ESC_k;
719     break;
720 ph10 172 }
721 nigel 93 braced = TRUE;
722     ptr++;
723     }
724     else braced = FALSE;
725    
726 ph10 391 if (ptr[1] == CHAR_MINUS)
727 nigel 93 {
728     negated = TRUE;
729     ptr++;
730     }
731     else negated = FALSE;
732    
733     c = 0;
734     while ((digitab[ptr[1]] & ctype_digit) != 0)
735 ph10 391 c = c * 10 + *(++ptr) - CHAR_0;
736 ph10 220
737 ph10 333 if (c < 0) /* Integer overflow */
738 ph10 213 {
739     *errorcodeptr = ERR61;
740     break;
741 ph10 220 }
742 ph10 345
743 ph10 391 if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
744 nigel 93 {
745     *errorcodeptr = ERR57;
746 ph10 213 break;
747 nigel 93 }
748 ph10 345
749 ph10 333 if (c == 0)
750     {
751     *errorcodeptr = ERR58;
752     break;
753 ph10 345 }
754 nigel 93
755     if (negated)
756     {
757     if (c > bracount)
758     {
759     *errorcodeptr = ERR15;
760 ph10 213 break;
761 nigel 93 }
762     c = bracount - (c - 1);
763     }
764    
765     c = -(ESC_REF + c);
766     break;
767    
768 nigel 77 /* The handling of escape sequences consisting of a string of digits
769     starting with one that is not zero is not straightforward. By experiment,
770     the way Perl works seems to be as follows:
771    
772     Outside a character class, the digits are read as a decimal number. If the
773     number is less than 10, or if there are that many previous extracting
774     left brackets, then it is a back reference. Otherwise, up to three octal
775     digits are read to form an escaped byte. Thus \123 is likely to be octal
776     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
777     value is greater than 377, the least significant 8 bits are taken. Inside a
778     character class, \ followed by a digit is always an octal number. */
779    
780 ph10 391 case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
781     case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
782 nigel 77
783     if (!isclass)
784     {
785     oldptr = ptr;
786 ph10 391 c -= CHAR_0;
787 nigel 77 while ((digitab[ptr[1]] & ctype_digit) != 0)
788 ph10 391 c = c * 10 + *(++ptr) - CHAR_0;
789 ph10 333 if (c < 0) /* Integer overflow */
790 ph10 213 {
791     *errorcodeptr = ERR61;
792 ph10 220 break;
793     }
794 nigel 77 if (c < 10 || c <= bracount)
795     {
796     c = -(ESC_REF + c);
797     break;
798     }
799     ptr = oldptr; /* Put the pointer back and fall through */
800     }
801    
802     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
803     generates a binary zero byte and treats the digit as a following literal.
804     Thus we have to pull back the pointer by one. */
805    
806 ph10 391 if ((c = *ptr) >= CHAR_8)
807 nigel 77 {
808     ptr--;
809     c = 0;
810     break;
811     }
812    
813     /* \0 always starts an octal number, but we may drop through to here with a
814 nigel 91 larger first octal digit. The original code used just to take the least
815     significant 8 bits of octal numbers (I think this is what early Perls used
816     to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
817     than 3 octal digits. */
818 nigel 77
819 ph10 391 case CHAR_0:
820     c -= CHAR_0;
821     while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
822     c = c * 8 + *(++ptr) - CHAR_0;
823 nigel 91 if (!utf8 && c > 255) *errorcodeptr = ERR51;
824 nigel 77 break;
825    
826 nigel 87 /* \x is complicated. \x{ddd} is a character number which can be greater
827     than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
828     treated as a data character. */
829 nigel 77
830 ph10 391 case CHAR_x:
831     if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
832 nigel 77 {
833     const uschar *pt = ptr + 2;
834 nigel 87 int count = 0;
835    
836 nigel 77 c = 0;
837     while ((digitab[*pt] & ctype_xdigit) != 0)
838     {
839 nigel 87 register int cc = *pt++;
840 ph10 391 if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
841 nigel 77 count++;
842 nigel 87
843 ph10 391 #ifndef EBCDIC /* ASCII/UTF-8 coding */
844     if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
845     c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
846 ph10 97 #else /* EBCDIC coding */
847 ph10 391 if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
848     c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
849 nigel 77 #endif
850     }
851 nigel 87
852 ph10 391 if (*pt == CHAR_RIGHT_CURLY_BRACKET)
853 nigel 77 {
854 nigel 87 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
855 nigel 77 ptr = pt;
856     break;
857     }
858 nigel 87
859 nigel 77 /* If the sequence of hex digits does not end with '}', then we don't
860     recognize this construct; fall through to the normal \x handling. */
861     }
862    
863 nigel 87 /* Read just a single-byte hex-defined char */
864 nigel 77
865     c = 0;
866     while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
867     {
868 ph10 391 int cc; /* Some compilers don't like */
869     cc = *(++ptr); /* ++ in initializers */
870     #ifndef EBCDIC /* ASCII/UTF-8 coding */
871     if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
872     c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
873 ph10 97 #else /* EBCDIC coding */
874 ph10 391 if (cc <= CHAR_z) cc += 64; /* Convert to upper case */
875     c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
876 nigel 77 #endif
877     }
878     break;
879    
880 nigel 93 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
881 ph10 574 An error is given if the byte following \c is not an ASCII character. This
882     coding is ASCII-specific, but then the whole concept of \cx is
883 nigel 93 ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
884 nigel 77
885 ph10 391 case CHAR_c:
886 nigel 77 c = *(++ptr);
887     if (c == 0)
888     {
889     *errorcodeptr = ERR2;
890 ph10 213 break;
891 nigel 77 }
892 ph10 574 #ifndef EBCDIC /* ASCII/UTF-8 coding */
893     if (c > 127) /* Excludes all non-ASCII in either mode */
894     {
895     *errorcodeptr = ERR68;
896 ph10 579 break;
897     }
898 ph10 391 if (c >= CHAR_a && c <= CHAR_z) c -= 32;
899 nigel 77 c ^= 0x40;
900 ph10 574 #else /* EBCDIC coding */
901 ph10 391 if (c >= CHAR_a && c <= CHAR_z) c += 64;
902 nigel 77 c ^= 0xC0;
903     #endif
904     break;
905    
906     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
907 ph10 274 other alphanumeric following \ is an error if PCRE_EXTRA was set;
908     otherwise, for Perl compatibility, it is a literal. This code looks a bit
909     odd, but there used to be some cases other than the default, and there may
910     be again in future, so I haven't "optimized" it. */
911 nigel 77
912     default:
913     if ((options & PCRE_EXTRA) != 0) switch(c)
914     {
915     default:
916     *errorcodeptr = ERR3;
917     break;
918     }
919     break;
920     }
921     }
922 ph10 518
923     /* Perl supports \N{name} for character names, as well as plain \N for "not
924 ph10 640 newline". PCRE does not support \N{name}. However, it does support
925     quantification such as \N{2,3}. */
926 nigel 77
927 ph10 640 if (c == -ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET &&
928     !is_counted_repeat(ptr+2))
929 ph10 518 *errorcodeptr = ERR37;
930 ph10 514
931 ph10 518 /* If PCRE_UCP is set, we change the values for \d etc. */
932    
933     if ((options & PCRE_UCP) != 0 && c <= -ESC_D && c >= -ESC_w)
934     c -= (ESC_DU - ESC_D);
935    
936     /* Set the pointer to the final character before returning. */
937    
938 nigel 77 *ptrptr = ptr;
939     return c;
940     }
941    
942    
943    
944     #ifdef SUPPORT_UCP
945     /*************************************************
946     * Handle \P and \p *
947     *************************************************/
948    
949     /* This function is called after \P or \p has been encountered, provided that
950     PCRE is compiled with support for Unicode properties. On entry, ptrptr is
951     pointing at the P or p. On exit, it is pointing at the final character of the
952     escape sequence.
953    
954     Argument:
955     ptrptr points to the pattern position pointer
956     negptr points to a boolean that is set TRUE for negation else FALSE
957 nigel 87 dptr points to an int that is set to the detailed property value
958 nigel 77 errorcodeptr points to the error code variable
959    
960 nigel 87 Returns: type value from ucp_type_table, or -1 for an invalid type
961 nigel 77 */
962    
963     static int
964 nigel 87 get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
965 nigel 77 {
966     int c, i, bot, top;
967     const uschar *ptr = *ptrptr;
968 nigel 87 char name[32];
969 nigel 77
970     c = *(++ptr);
971     if (c == 0) goto ERROR_RETURN;
972    
973     *negptr = FALSE;
974    
975 nigel 87 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
976     negation. */
977 nigel 77
978 ph10 391 if (c == CHAR_LEFT_CURLY_BRACKET)
979 nigel 77 {
980 ph10 391 if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
981 nigel 77 {
982     *negptr = TRUE;
983     ptr++;
984     }
985 ph10 199 for (i = 0; i < (int)sizeof(name) - 1; i++)
986 nigel 77 {
987     c = *(++ptr);
988     if (c == 0) goto ERROR_RETURN;
989 ph10 391 if (c == CHAR_RIGHT_CURLY_BRACKET) break;
990 nigel 77 name[i] = c;
991     }
992 ph10 391 if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
993 nigel 77 name[i] = 0;
994     }
995    
996     /* Otherwise there is just one following character */
997    
998     else
999     {
1000     name[0] = c;
1001     name[1] = 0;
1002     }
1003    
1004     *ptrptr = ptr;
1005    
1006     /* Search for a recognized property name using binary chop */
1007    
1008     bot = 0;
1009     top = _pcre_utt_size;
1010    
1011     while (bot < top)
1012     {
1013 nigel 87 i = (bot + top) >> 1;
1014 ph10 240 c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);
1015 nigel 87 if (c == 0)
1016     {
1017     *dptr = _pcre_utt[i].value;
1018     return _pcre_utt[i].type;
1019     }
1020 nigel 77 if (c > 0) bot = i + 1; else top = i;
1021     }
1022    
1023     *errorcodeptr = ERR47;
1024     *ptrptr = ptr;
1025     return -1;
1026    
1027     ERROR_RETURN:
1028     *errorcodeptr = ERR46;
1029     *ptrptr = ptr;
1030     return -1;
1031     }
1032     #endif
1033    
1034    
1035    
1036    
1037     /*************************************************
1038     * Read repeat counts *
1039     *************************************************/
1040    
1041     /* Read an item of the form {n,m} and return the values. This is called only
1042     after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1043     so the syntax is guaranteed to be correct, but we need to check the values.
1044    
1045     Arguments:
1046     p pointer to first char after '{'
1047     minp pointer to int for min
1048     maxp pointer to int for max
1049     returned as -1 if no max
1050     errorcodeptr points to error code variable
1051    
1052     Returns: pointer to '}' on success;
1053     current ptr on error, with errorcodeptr set non-zero
1054     */
1055    
1056     static const uschar *
1057     read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
1058     {
1059     int min = 0;
1060     int max = -1;
1061    
1062 nigel 81 /* Read the minimum value and do a paranoid check: a negative value indicates
1063     an integer overflow. */
1064    
1065 ph10 391 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - CHAR_0;
1066 nigel 81 if (min < 0 || min > 65535)
1067     {
1068     *errorcodeptr = ERR5;
1069     return p;
1070     }
1071 nigel 77
1072 nigel 81 /* Read the maximum value if there is one, and again do a paranoid on its size.
1073     Also, max must not be less than min. */
1074    
1075 ph10 391 if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
1076 nigel 77 {
1077 ph10 391 if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
1078 nigel 77 {
1079     max = 0;
1080 ph10 391 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - CHAR_0;
1081 nigel 81 if (max < 0 || max > 65535)
1082     {
1083     *errorcodeptr = ERR5;
1084     return p;
1085     }
1086 nigel 77 if (max < min)
1087     {
1088     *errorcodeptr = ERR4;
1089     return p;
1090     }
1091     }
1092     }
1093    
1094 nigel 81 /* Fill in the required variables, and pass back the pointer to the terminating
1095     '}'. */
1096 nigel 77
1097 nigel 81 *minp = min;
1098     *maxp = max;
1099 nigel 77 return p;
1100     }
1101    
1102    
1103    
1104     /*************************************************
1105 ph10 408 * Subroutine for finding forward reference *
1106 nigel 91 *************************************************/
1107    
1108 ph10 408 /* This recursive function is called only from find_parens() below. The
1109     top-level call starts at the beginning of the pattern. All other calls must
1110     start at a parenthesis. It scans along a pattern's text looking for capturing
1111 nigel 93 subpatterns, and counting them. If it finds a named pattern that matches the
1112     name it is given, it returns its number. Alternatively, if the name is NULL, it
1113 ph10 578 returns when it reaches a given numbered subpattern. Recursion is used to keep
1114     track of subpatterns that reset the capturing group numbers - the (?| feature.
1115 nigel 91
1116 ph10 578 This function was originally called only from the second pass, in which we know
1117     that if (?< or (?' or (?P< is encountered, the name will be correctly
1118     terminated because that is checked in the first pass. There is now one call to
1119     this function in the first pass, to check for a recursive back reference by
1120     name (so that we can make the whole group atomic). In this case, we need check
1121 ph10 579 only up to the current position in the pattern, and that is still OK because
1122     any previous occurrences will have been checked. To make this work, the test
1123     for "end of pattern" is a check against cd->end_pattern in the main loop,
1124 ph10 578 instead of looking for a binary zero. This means that the special first-pass
1125 ph10 579 call can adjust cd->end_pattern temporarily. (Checks for binary zero while
1126     processing items within the loop are OK, because afterwards the main loop will
1127 ph10 578 terminate.)
1128    
1129 nigel 91 Arguments:
1130 ph10 408 ptrptr address of the current character pointer (updated)
1131 ph10 345 cd compile background data
1132 nigel 93 name name to seek, or NULL if seeking a numbered subpattern
1133     lorn name length, or subpattern number if name is NULL
1134     xmode TRUE if we are in /x mode
1135 ph10 579 utf8 TRUE if we are in UTF-8 mode
1136 ph10 411 count pointer to the current capturing subpattern number (updated)
1137 nigel 91
1138     Returns: the number of the named subpattern, or -1 if not found
1139     */
1140    
1141     static int
1142 ph10 408 find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn,
1143 ph10 556 BOOL xmode, BOOL utf8, int *count)
1144 nigel 91 {
1145 ph10 408 uschar *ptr = *ptrptr;
1146     int start_count = *count;
1147     int hwm_count = start_count;
1148     BOOL dup_parens = FALSE;
1149 nigel 93
1150 ph10 411 /* If the first character is a parenthesis, check on the type of group we are
1151 ph10 408 dealing with. The very first call may not start with a parenthesis. */
1152    
1153     if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1154     {
1155 ph10 544 /* Handle specials such as (*SKIP) or (*UTF8) etc. */
1156 ph10 545
1157 ph10 544 if (ptr[1] == CHAR_ASTERISK) ptr += 2;
1158 ph10 545
1159 ph10 544 /* Handle a normal, unnamed capturing parenthesis. */
1160 ph10 408
1161 ph10 544 else if (ptr[1] != CHAR_QUESTION_MARK)
1162 ph10 408 {
1163     *count += 1;
1164     if (name == NULL && *count == lorn) return *count;
1165 ph10 411 ptr++;
1166 ph10 408 }
1167    
1168 ph10 544 /* All cases now have (? at the start. Remember when we are in a group
1169     where the parenthesis numbers are duplicated. */
1170    
1171     else if (ptr[2] == CHAR_VERTICAL_LINE)
1172     {
1173     ptr += 3;
1174     dup_parens = TRUE;
1175     }
1176 ph10 545
1177 ph10 544 /* Handle comments; all characters are allowed until a ket is reached. */
1178    
1179     else if (ptr[2] == CHAR_NUMBER_SIGN)
1180     {
1181     for (ptr += 3; *ptr != 0; ptr++) if (*ptr == CHAR_RIGHT_PARENTHESIS) break;
1182     goto FAIL_EXIT;
1183 ph10 545 }
1184 ph10 544
1185 ph10 408 /* Handle a condition. If it is an assertion, just carry on so that it
1186     is processed as normal. If not, skip to the closing parenthesis of the
1187 ph10 544 condition (there can't be any nested parens). */
1188 ph10 411
1189 ph10 408 else if (ptr[2] == CHAR_LEFT_PARENTHESIS)
1190     {
1191 ph10 411 ptr += 2;
1192 ph10 408 if (ptr[1] != CHAR_QUESTION_MARK)
1193     {
1194     while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
1195 ph10 411 if (*ptr != 0) ptr++;
1196 ph10 408 }
1197 ph10 411 }
1198    
1199 ph10 544 /* Start with (? but not a condition. */
1200 ph10 408
1201     else
1202 ph10 411 {
1203 ph10 408 ptr += 2;
1204     if (*ptr == CHAR_P) ptr++; /* Allow optional P */
1205    
1206     /* We have to disambiguate (?<! and (?<= from (?<name> for named groups */
1207 ph10 411
1208 ph10 408 if ((*ptr == CHAR_LESS_THAN_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK &&
1209     ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)
1210     {
1211     int term;
1212     const uschar *thisname;
1213     *count += 1;
1214     if (name == NULL && *count == lorn) return *count;
1215     term = *ptr++;
1216     if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;
1217     thisname = ptr;
1218     while (*ptr != term) ptr++;
1219     if (name != NULL && lorn == ptr - thisname &&
1220     strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1221     return *count;
1222 ph10 461 term++;
1223 ph10 411 }
1224 ph10 408 }
1225 ph10 411 }
1226 ph10 408
1227 ph10 411 /* Past any initial parenthesis handling, scan for parentheses or vertical
1228 ph10 579 bars. Stop if we get to cd->end_pattern. Note that this is important for the
1229     first-pass call when this value is temporarily adjusted to stop at the current
1230 ph10 578 position. So DO NOT change this to a test for binary zero. */
1231 ph10 408
1232 ph10 578 for (; ptr < cd->end_pattern; ptr++)
1233 nigel 91 {
1234 nigel 93 /* Skip over backslashed characters and also entire \Q...\E */
1235    
1236 ph10 391 if (*ptr == CHAR_BACKSLASH)
1237 nigel 93 {
1238 ph10 408 if (*(++ptr) == 0) goto FAIL_EXIT;
1239 ph10 391 if (*ptr == CHAR_Q) for (;;)
1240 nigel 93 {
1241 ph10 391 while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1242 ph10 408 if (*ptr == 0) goto FAIL_EXIT;
1243 ph10 391 if (*(++ptr) == CHAR_E) break;
1244 nigel 93 }
1245     continue;
1246     }
1247    
1248 ph10 340 /* Skip over character classes; this logic must be similar to the way they
1249     are handled for real. If the first character is '^', skip it. Also, if the
1250     first few characters (either before or after ^) are \Q\E or \E we skip them
1251 ph10 392 too. This makes for compatibility with Perl. Note the use of STR macros to
1252 ph10 391 encode "Q\\E" so that it works in UTF-8 on EBCDIC platforms. */
1253 nigel 93
1254 ph10 391 if (*ptr == CHAR_LEFT_SQUARE_BRACKET)
1255 nigel 93 {
1256 ph10 340 BOOL negate_class = FALSE;
1257     for (;;)
1258     {
1259 ph10 438 if (ptr[1] == CHAR_BACKSLASH)
1260 ph10 340 {
1261 ph10 438 if (ptr[2] == CHAR_E)
1262     ptr+= 2;
1263     else if (strncmp((const char *)ptr+2,
1264 ph10 392 STR_Q STR_BACKSLASH STR_E, 3) == 0)
1265 ph10 438 ptr += 4;
1266 ph10 392 else
1267 ph10 391 break;
1268 ph10 340 }
1269 ph10 438 else if (!negate_class && ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1270 ph10 461 {
1271 ph10 340 negate_class = TRUE;
1272 ph10 438 ptr++;
1273 ph10 461 }
1274 ph10 340 else break;
1275     }
1276    
1277     /* If the next character is ']', it is a data character that must be
1278 ph10 341 skipped, except in JavaScript compatibility mode. */
1279 ph10 345
1280 ph10 392 if (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET &&
1281 ph10 391 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
1282 ph10 345 ptr++;
1283    
1284 ph10 391 while (*(++ptr) != CHAR_RIGHT_SQUARE_BRACKET)
1285 nigel 93 {
1286 ph10 220 if (*ptr == 0) return -1;
1287 ph10 391 if (*ptr == CHAR_BACKSLASH)
1288 nigel 93 {
1289 ph10 408 if (*(++ptr) == 0) goto FAIL_EXIT;
1290 ph10 391 if (*ptr == CHAR_Q) for (;;)
1291 nigel 93 {
1292 ph10 391 while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1293 ph10 408 if (*ptr == 0) goto FAIL_EXIT;
1294 ph10 391 if (*(++ptr) == CHAR_E) break;
1295 nigel 93 }
1296     continue;
1297     }
1298     }
1299     continue;
1300     }
1301    
1302     /* Skip comments in /x mode */
1303    
1304 ph10 391 if (xmode && *ptr == CHAR_NUMBER_SIGN)
1305 nigel 93 {
1306 ph10 579 ptr++;
1307 ph10 556 while (*ptr != 0)
1308     {
1309     if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
1310     ptr++;
1311 ph10 579 #ifdef SUPPORT_UTF8
1312 ph10 556 if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
1313     #endif
1314     }
1315 ph10 408 if (*ptr == 0) goto FAIL_EXIT;
1316 nigel 93 continue;
1317     }
1318    
1319 ph10 408 /* Check for the special metacharacters */
1320 ph10 411
1321 ph10 408 if (*ptr == CHAR_LEFT_PARENTHESIS)
1322 nigel 93 {
1323 ph10 556 int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, count);
1324 ph10 408 if (rc > 0) return rc;
1325     if (*ptr == 0) goto FAIL_EXIT;
1326 nigel 93 }
1327 ph10 411
1328 ph10 408 else if (*ptr == CHAR_RIGHT_PARENTHESIS)
1329     {
1330     if (dup_parens && *count < hwm_count) *count = hwm_count;
1331 ph10 545 goto FAIL_EXIT;
1332 ph10 408 }
1333 ph10 411
1334     else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)
1335 ph10 408 {
1336     if (*count > hwm_count) hwm_count = *count;
1337     *count = start_count;
1338 ph10 411 }
1339 ph10 408 }
1340 nigel 93
1341 ph10 408 FAIL_EXIT:
1342     *ptrptr = ptr;
1343     return -1;
1344     }
1345 nigel 93
1346    
1347    
1348    
1349 ph10 408 /*************************************************
1350     * Find forward referenced subpattern *
1351     *************************************************/
1352 nigel 93
1353 ph10 408 /* This function scans along a pattern's text looking for capturing
1354     subpatterns, and counting them. If it finds a named pattern that matches the
1355     name it is given, it returns its number. Alternatively, if the name is NULL, it
1356     returns when it reaches a given numbered subpattern. This is used for forward
1357     references to subpatterns. We used to be able to start this scan from the
1358     current compiling point, using the current count value from cd->bracount, and
1359     do it all in a single loop, but the addition of the possibility of duplicate
1360     subpattern numbers means that we have to scan from the very start, in order to
1361     take account of such duplicates, and to use a recursive function to keep track
1362     of the different types of group.
1363    
1364     Arguments:
1365     cd compile background data
1366     name name to seek, or NULL if seeking a numbered subpattern
1367     lorn name length, or subpattern number if name is NULL
1368     xmode TRUE if we are in /x mode
1369 ph10 579 utf8 TRUE if we are in UTF-8 mode
1370 ph10 408
1371     Returns: the number of the found subpattern, or -1 if not found
1372     */
1373    
1374     static int
1375 ph10 556 find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode,
1376     BOOL utf8)
1377 ph10 408 {
1378     uschar *ptr = (uschar *)cd->start_pattern;
1379     int count = 0;
1380     int rc;
1381    
1382     /* If the pattern does not start with an opening parenthesis, the first call
1383     to find_parens_sub() will scan right to the end (if necessary). However, if it
1384     does start with a parenthesis, find_parens_sub() will return when it hits the
1385     matching closing parens. That is why we have to have a loop. */
1386    
1387 ph10 411 for (;;)
1388     {
1389 ph10 556 rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, &count);
1390 ph10 411 if (rc > 0 || *ptr++ == 0) break;
1391     }
1392    
1393 ph10 408 return rc;
1394 nigel 91 }
1395    
1396    
1397    
1398 ph10 408
1399 nigel 91 /*************************************************
1400 nigel 77 * Find first significant op code *
1401     *************************************************/
1402    
1403     /* This is called by several functions that scan a compiled expression looking
1404     for a fixed first character, or an anchoring op code etc. It skips over things
1405 ph10 602 that do not influence this. For some calls, it makes sense to skip negative
1406     forward and all backward assertions, and also the \b assertion; for others it
1407     does not.
1408 nigel 77
1409     Arguments:
1410     code pointer to the start of the group
1411     skipassert TRUE if certain assertions are to be skipped
1412    
1413     Returns: pointer to the first significant opcode
1414     */
1415    
1416     static const uschar*
1417 ph10 604 first_significant_code(const uschar *code, BOOL skipassert)
1418 nigel 77 {
1419     for (;;)
1420     {
1421     switch ((int)*code)
1422     {
1423     case OP_ASSERT_NOT:
1424     case OP_ASSERTBACK:
1425     case OP_ASSERTBACK_NOT:
1426     if (!skipassert) return code;
1427     do code += GET(code, 1); while (*code == OP_ALT);
1428     code += _pcre_OP_lengths[*code];
1429     break;
1430    
1431     case OP_WORD_BOUNDARY:
1432     case OP_NOT_WORD_BOUNDARY:
1433     if (!skipassert) return code;
1434     /* Fall through */
1435    
1436     case OP_CALLOUT:
1437     case OP_CREF:
1438 ph10 459 case OP_NCREF:
1439 nigel 93 case OP_RREF:
1440 ph10 459 case OP_NRREF:
1441 nigel 93 case OP_DEF:
1442 nigel 77 code += _pcre_OP_lengths[*code];
1443     break;
1444    
1445     default:
1446     return code;
1447     }
1448     }
1449     /* Control never reaches here */
1450     }
1451    
1452    
1453    
1454    
1455     /*************************************************
1456 ph10 454 * Find the fixed length of a branch *
1457 nigel 77 *************************************************/
1458    
1459 ph10 454 /* Scan a branch and compute the fixed length of subject that will match it,
1460 nigel 77 if the length is fixed. This is needed for dealing with backward assertions.
1461 ph10 461 In UTF8 mode, the result is in characters rather than bytes. The branch is
1462 ph10 454 temporarily terminated with OP_END when this function is called.
1463 nigel 77
1464 ph10 461 This function is called when a backward assertion is encountered, so that if it
1465     fails, the error message can point to the correct place in the pattern.
1466 ph10 454 However, we cannot do this when the assertion contains subroutine calls,
1467 ph10 461 because they can be forward references. We solve this by remembering this case
1468 ph10 454 and doing the check at the end; a flag specifies which mode we are running in.
1469    
1470 nigel 77 Arguments:
1471     code points to the start of the pattern (the bracket)
1472 ph10 604 utf8 TRUE in UTF-8 mode
1473 ph10 461 atend TRUE if called when the pattern is complete
1474     cd the "compile data" structure
1475 nigel 77
1476 ph10 461 Returns: the fixed length,
1477 ph10 454 or -1 if there is no fixed length,
1478 nigel 77 or -2 if \C was encountered
1479 ph10 454 or -3 if an OP_RECURSE item was encountered and atend is FALSE
1480 nigel 77 */
1481    
1482     static int
1483 ph10 604 find_fixedlength(uschar *code, BOOL utf8, BOOL atend, compile_data *cd)
1484 nigel 77 {
1485     int length = -1;
1486    
1487     register int branchlength = 0;
1488     register uschar *cc = code + 1 + LINK_SIZE;
1489    
1490     /* Scan along the opcodes for this branch. If we get to the end of the
1491     branch, check the length against that of the other branches. */
1492    
1493     for (;;)
1494     {
1495     int d;
1496 ph10 454 uschar *ce, *cs;
1497 nigel 77 register int op = *cc;
1498     switch (op)
1499     {
1500 ph10 604 /* We only need to continue for OP_CBRA (normal capturing bracket) and
1501     OP_BRA (normal non-capturing bracket) because the other variants of these
1502     opcodes are all concerned with unlimited repeated groups, which of course
1503     are not of fixed length. They will cause a -1 response from the default
1504     case of this switch. */
1505    
1506 nigel 93 case OP_CBRA:
1507 nigel 77 case OP_BRA:
1508     case OP_ONCE:
1509     case OP_COND:
1510 ph10 604 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), utf8, atend, cd);
1511 nigel 77 if (d < 0) return d;
1512     branchlength += d;
1513     do cc += GET(cc, 1); while (*cc == OP_ALT);
1514     cc += 1 + LINK_SIZE;
1515     break;
1516    
1517     /* Reached end of a branch; if it's a ket it is the end of a nested
1518     call. If it's ALT it is an alternation in a nested call. If it is
1519 ph10 604 END it's the end of the outer call. All can be handled by the same code.
1520     Note that we must not include the OP_KETRxxx opcodes here, because they
1521     all imply an unlimited repeat. */
1522 nigel 77
1523     case OP_ALT:
1524     case OP_KET:
1525     case OP_END:
1526     if (length < 0) length = branchlength;
1527     else if (length != branchlength) return -1;
1528     if (*cc != OP_ALT) return length;
1529     cc += 1 + LINK_SIZE;
1530     branchlength = 0;
1531     break;
1532 ph10 461
1533 ph10 454 /* A true recursion implies not fixed length, but a subroutine call may
1534     be OK. If the subroutine is a forward reference, we can't deal with
1535     it until the end of the pattern, so return -3. */
1536 ph10 461
1537 ph10 454 case OP_RECURSE:
1538     if (!atend) return -3;
1539     cs = ce = (uschar *)cd->start_code + GET(cc, 1); /* Start subpattern */
1540     do ce += GET(ce, 1); while (*ce == OP_ALT); /* End subpattern */
1541     if (cc > cs && cc < ce) return -1; /* Recursion */
1542 ph10 604 d = find_fixedlength(cs + 2, utf8, atend, cd);
1543 ph10 461 if (d < 0) return d;
1544 ph10 454 branchlength += d;
1545     cc += 1 + LINK_SIZE;
1546 ph10 461 break;
1547 nigel 77
1548     /* Skip over assertive subpatterns */
1549    
1550     case OP_ASSERT:
1551     case OP_ASSERT_NOT:
1552     case OP_ASSERTBACK:
1553     case OP_ASSERTBACK_NOT:
1554     do cc += GET(cc, 1); while (*cc == OP_ALT);
1555     /* Fall through */
1556    
1557     /* Skip over things that don't match chars */
1558    
1559     case OP_REVERSE:
1560     case OP_CREF:
1561 ph10 459 case OP_NCREF:
1562 nigel 93 case OP_RREF:
1563 ph10 459 case OP_NRREF:
1564 nigel 93 case OP_DEF:
1565 nigel 77 case OP_CALLOUT:
1566     case OP_SOD:
1567     case OP_SOM:
1568 ph10 500 case OP_SET_SOM:
1569 nigel 77 case OP_EOD:
1570     case OP_EODN:
1571     case OP_CIRC:
1572 ph10 602 case OP_CIRCM:
1573 nigel 77 case OP_DOLL:
1574 ph10 602 case OP_DOLLM:
1575 nigel 77 case OP_NOT_WORD_BOUNDARY:
1576     case OP_WORD_BOUNDARY:
1577     cc += _pcre_OP_lengths[*cc];
1578     break;
1579    
1580     /* Handle literal characters */
1581    
1582     case OP_CHAR:
1583 ph10 602 case OP_CHARI:
1584 nigel 91 case OP_NOT:
1585 ph10 604 case OP_NOTI:
1586 nigel 77 branchlength++;
1587     cc += 2;
1588     #ifdef SUPPORT_UTF8
1589 ph10 604 if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
1590 nigel 77 #endif
1591     break;
1592    
1593     /* Handle exact repetitions. The count is already in characters, but we
1594     need to skip over a multibyte character in UTF8 mode. */
1595    
1596     case OP_EXACT:
1597     branchlength += GET2(cc,1);
1598     cc += 4;
1599     #ifdef SUPPORT_UTF8
1600 ph10 604 if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
1601 nigel 77 #endif
1602     break;
1603    
1604     case OP_TYPEEXACT:
1605     branchlength += GET2(cc,1);
1606 ph10 220 if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1607 nigel 77 cc += 4;
1608     break;
1609    
1610     /* Handle single-char matchers */
1611    
1612     case OP_PROP:
1613     case OP_NOTPROP:
1614 nigel 87 cc += 2;
1615 nigel 77 /* Fall through */
1616    
1617     case OP_NOT_DIGIT:
1618     case OP_DIGIT:
1619     case OP_NOT_WHITESPACE:
1620     case OP_WHITESPACE:
1621     case OP_NOT_WORDCHAR:
1622     case OP_WORDCHAR:
1623     case OP_ANY:
1624 ph10 342 case OP_ALLANY:
1625 nigel 77 branchlength++;
1626     cc++;
1627     break;
1628    
1629     /* The single-byte matcher isn't allowed */
1630    
1631     case OP_ANYBYTE:
1632     return -2;
1633    
1634     /* Check a class for variable quantification */
1635    
1636     #ifdef SUPPORT_UTF8
1637     case OP_XCLASS:
1638     cc += GET(cc, 1) - 33;
1639     /* Fall through */
1640     #endif
1641    
1642     case OP_CLASS:
1643     case OP_NCLASS:
1644     cc += 33;
1645    
1646     switch (*cc)
1647     {
1648     case OP_CRSTAR:
1649     case OP_CRMINSTAR:
1650     case OP_CRQUERY:
1651     case OP_CRMINQUERY:
1652     return -1;
1653    
1654     case OP_CRRANGE:
1655     case OP_CRMINRANGE:
1656     if (GET2(cc,1) != GET2(cc,3)) return -1;
1657     branchlength += GET2(cc,1);
1658     cc += 5;
1659     break;
1660    
1661     default:
1662     branchlength++;
1663     }
1664     break;
1665    
1666     /* Anything else is variable length */
1667    
1668     default:
1669     return -1;
1670     }
1671     }
1672     /* Control never gets here */
1673     }
1674    
1675    
1676    
1677    
1678     /*************************************************
1679 ph10 454 * Scan compiled regex for specific bracket *
1680 nigel 77 *************************************************/
1681    
1682     /* This little function scans through a compiled pattern until it finds a
1683 ph10 454 capturing bracket with the given number, or, if the number is negative, an
1684 ph10 461 instance of OP_REVERSE for a lookbehind. The function is global in the C sense
1685     so that it can be called from pcre_study() when finding the minimum matching
1686 ph10 455 length.
1687 nigel 77
1688     Arguments:
1689     code points to start of expression
1690     utf8 TRUE in UTF-8 mode
1691 ph10 454 number the required bracket number or negative to find a lookbehind
1692 nigel 77
1693     Returns: pointer to the opcode for the bracket, or NULL if not found
1694     */
1695    
1696 ph10 455 const uschar *
1697     _pcre_find_bracket(const uschar *code, BOOL utf8, int number)
1698 nigel 77 {
1699     for (;;)
1700     {
1701     register int c = *code;
1702 ph10 618
1703 nigel 77 if (c == OP_END) return NULL;
1704 nigel 91
1705     /* XCLASS is used for classes that cannot be represented just by a bit
1706     map. This includes negated single high-valued characters. The length in
1707     the table is zero; the actual length is stored in the compiled code. */
1708    
1709     if (c == OP_XCLASS) code += GET(code, 1);
1710 ph10 461
1711 ph10 454 /* Handle recursion */
1712 ph10 461
1713 ph10 454 else if (c == OP_REVERSE)
1714     {
1715 ph10 461 if (number < 0) return (uschar *)code;
1716 ph10 454 code += _pcre_OP_lengths[c];
1717     }
1718 nigel 91
1719 nigel 93 /* Handle capturing bracket */
1720 nigel 91
1721 ph10 604 else if (c == OP_CBRA || c == OP_SCBRA ||
1722     c == OP_CBRAPOS || c == OP_SCBRAPOS)
1723 nigel 77 {
1724 nigel 93 int n = GET2(code, 1+LINK_SIZE);
1725 nigel 77 if (n == number) return (uschar *)code;
1726 nigel 93 code += _pcre_OP_lengths[c];
1727 nigel 77 }
1728 nigel 91
1729 ph10 220 /* Otherwise, we can get the item's length from the table, except that for
1730     repeated character types, we have to test for \p and \P, which have an extra
1731 ph10 512 two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
1732 ph10 510 must add in its length. */
1733 nigel 91
1734 nigel 77 else
1735     {
1736 ph10 218 switch(c)
1737     {
1738     case OP_TYPESTAR:
1739     case OP_TYPEMINSTAR:
1740     case OP_TYPEPLUS:
1741     case OP_TYPEMINPLUS:
1742     case OP_TYPEQUERY:
1743     case OP_TYPEMINQUERY:
1744     case OP_TYPEPOSSTAR:
1745     case OP_TYPEPOSPLUS:
1746     case OP_TYPEPOSQUERY:
1747     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1748 ph10 220 break;
1749 ph10 221
1750     case OP_TYPEUPTO:
1751     case OP_TYPEMINUPTO:
1752     case OP_TYPEEXACT:
1753     case OP_TYPEPOSUPTO:
1754     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1755     break;
1756 ph10 512
1757 ph10 510 case OP_MARK:
1758     case OP_PRUNE_ARG:
1759     case OP_SKIP_ARG:
1760     code += code[1];
1761 ph10 512 break;
1762 ph10 550
1763     case OP_THEN_ARG:
1764     code += code[1+LINK_SIZE];
1765     break;
1766 ph10 220 }
1767    
1768 ph10 218 /* Add in the fixed length from the table */
1769 ph10 220
1770 nigel 77 code += _pcre_OP_lengths[c];
1771 ph10 220
1772 ph10 218 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1773     a multi-byte character. The length in the table is a minimum, so we have to
1774     arrange to skip the extra bytes. */
1775 ph10 220
1776 ph10 107 #ifdef SUPPORT_UTF8
1777 nigel 77 if (utf8) switch(c)
1778     {
1779     case OP_CHAR:
1780 ph10 602 case OP_CHARI:
1781 nigel 77 case OP_EXACT:
1782 ph10 602 case OP_EXACTI:
1783 nigel 77 case OP_UPTO:
1784 ph10 602 case OP_UPTOI:
1785 nigel 77 case OP_MINUPTO:
1786 ph10 602 case OP_MINUPTOI:
1787 nigel 93 case OP_POSUPTO:
1788 ph10 602 case OP_POSUPTOI:
1789 nigel 77 case OP_STAR:
1790 ph10 602 case OP_STARI:
1791 nigel 77 case OP_MINSTAR:
1792 ph10 602 case OP_MINSTARI:
1793 nigel 93 case OP_POSSTAR:
1794 ph10 602 case OP_POSSTARI:
1795 nigel 77 case OP_PLUS:
1796 ph10 602 case OP_PLUSI:
1797 nigel 77 case OP_MINPLUS:
1798 ph10 602 case OP_MINPLUSI:
1799 nigel 93 case OP_POSPLUS:
1800 ph10 602 case OP_POSPLUSI:
1801 nigel 77 case OP_QUERY:
1802 ph10 602 case OP_QUERYI:
1803 nigel 77 case OP_MINQUERY:
1804 ph10 602 case OP_MINQUERYI:
1805 nigel 93 case OP_POSQUERY:
1806 ph10 602 case OP_POSQUERYI:
1807 nigel 93 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1808 nigel 77 break;
1809     }
1810 ph10 369 #else
1811     (void)(utf8); /* Keep compiler happy by referencing function argument */
1812 ph10 111 #endif
1813 nigel 77 }
1814     }
1815     }
1816    
1817    
1818    
1819     /*************************************************
1820     * Scan compiled regex for recursion reference *
1821     *************************************************/
1822    
1823     /* This little function scans through a compiled pattern until it finds an
1824     instance of OP_RECURSE.
1825    
1826     Arguments:
1827     code points to start of expression
1828     utf8 TRUE in UTF-8 mode
1829    
1830     Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1831     */
1832    
1833     static const uschar *
1834     find_recurse(const uschar *code, BOOL utf8)
1835     {
1836     for (;;)
1837     {
1838     register int c = *code;
1839     if (c == OP_END) return NULL;
1840 nigel 91 if (c == OP_RECURSE) return code;
1841 ph10 220
1842 nigel 91 /* XCLASS is used for classes that cannot be represented just by a bit
1843     map. This includes negated single high-valued characters. The length in
1844     the table is zero; the actual length is stored in the compiled code. */
1845    
1846     if (c == OP_XCLASS) code += GET(code, 1);
1847    
1848 ph10 220 /* Otherwise, we can get the item's length from the table, except that for
1849     repeated character types, we have to test for \p and \P, which have an extra
1850 ph10 512 two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
1851 ph10 510 must add in its length. */
1852 nigel 91
1853 nigel 77 else
1854     {
1855 ph10 218 switch(c)
1856     {
1857     case OP_TYPESTAR:
1858     case OP_TYPEMINSTAR:
1859     case OP_TYPEPLUS:
1860     case OP_TYPEMINPLUS:
1861     case OP_TYPEQUERY:
1862     case OP_TYPEMINQUERY:
1863     case OP_TYPEPOSSTAR:
1864     case OP_TYPEPOSPLUS:
1865     case OP_TYPEPOSQUERY:
1866     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1867 ph10 220 break;
1868 ph10 221
1869     case OP_TYPEPOSUPTO:
1870     case OP_TYPEUPTO:
1871     case OP_TYPEMINUPTO:
1872     case OP_TYPEEXACT:
1873     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1874     break;
1875 ph10 512
1876 ph10 510 case OP_MARK:
1877     case OP_PRUNE_ARG:
1878     case OP_SKIP_ARG:
1879     code += code[1];
1880 ph10 512 break;
1881 ph10 550
1882     case OP_THEN_ARG:
1883     code += code[1+LINK_SIZE];
1884     break;
1885 ph10 220 }
1886    
1887 ph10 218 /* Add in the fixed length from the table */
1888    
1889 nigel 77 code += _pcre_OP_lengths[c];
1890 ph10 220
1891 ph10 218 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1892     by a multi-byte character. The length in the table is a minimum, so we have
1893     to arrange to skip the extra bytes. */
1894 ph10 220
1895 ph10 107 #ifdef SUPPORT_UTF8
1896 nigel 77 if (utf8) switch(c)
1897     {
1898     case OP_CHAR:
1899 ph10 602 case OP_CHARI:
1900 nigel 77 case OP_EXACT:
1901 ph10 602 case OP_EXACTI:
1902 nigel 77 case OP_UPTO:
1903 ph10 602 case OP_UPTOI:
1904 nigel 77 case OP_MINUPTO:
1905 ph10 602 case OP_MINUPTOI:
1906 nigel 93 case OP_POSUPTO:
1907 ph10 602 case OP_POSUPTOI:
1908 nigel 77 case OP_STAR:
1909 ph10 602 case OP_STARI:
1910 nigel 77 case OP_MINSTAR:
1911 ph10 602 case OP_MINSTARI:
1912 nigel 93 case OP_POSSTAR:
1913 ph10 602 case OP_POSSTARI:
1914 nigel 77 case OP_PLUS:
1915 ph10 602 case OP_PLUSI:
1916 nigel 77 case OP_MINPLUS:
1917 ph10 602 case OP_MINPLUSI:
1918 nigel 93 case OP_POSPLUS:
1919 ph10 602 case OP_POSPLUSI:
1920 nigel 77 case OP_QUERY:
1921 ph10 602 case OP_QUERYI:
1922 nigel 77 case OP_MINQUERY:
1923 ph10 602 case OP_MINQUERYI:
1924 nigel 93 case OP_POSQUERY:
1925 ph10 602 case OP_POSQUERYI:
1926 nigel 93 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1927 nigel 77 break;
1928     }
1929 ph10 369 #else
1930     (void)(utf8); /* Keep compiler happy by referencing function argument */
1931 ph10 111 #endif
1932 nigel 77 }
1933     }
1934     }
1935    
1936    
1937    
1938     /*************************************************
1939     * Scan compiled branch for non-emptiness *
1940     *************************************************/
1941    
1942     /* This function scans through a branch of a compiled pattern to see whether it
1943 nigel 93 can match the empty string or not. It is called from could_be_empty()
1944     below and from compile_branch() when checking for an unlimited repeat of a
1945     group that can match nothing. Note that first_significant_code() skips over
1946 ph10 282 backward and negative forward assertions when its final argument is TRUE. If we
1947     hit an unclosed bracket, we return "empty" - this means we've struck an inner
1948     bracket whose current branch will already have been scanned.
1949 nigel 77
1950     Arguments:
1951     code points to start of search
1952     endcode points to where to stop
1953     utf8 TRUE if in UTF8 mode
1954 ph10 503 cd contains pointers to tables etc.
1955 nigel 77
1956     Returns: TRUE if what is matched could be empty
1957     */
1958    
1959     static BOOL
1960 ph10 503 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8,
1961     compile_data *cd)
1962 nigel 77 {
1963     register int c;
1964 ph10 604 for (code = first_significant_code(code + _pcre_OP_lengths[*code], TRUE);
1965 nigel 77 code < endcode;
1966 ph10 604 code = first_significant_code(code + _pcre_OP_lengths[c], TRUE))
1967 nigel 77 {
1968     const uschar *ccode;
1969    
1970     c = *code;
1971 ph10 507
1972 ph10 286 /* Skip over forward assertions; the other assertions are skipped by
1973 ph10 282 first_significant_code() with a TRUE final argument. */
1974 ph10 286
1975 ph10 282 if (c == OP_ASSERT)
1976 ph10 286 {
1977 ph10 282 do code += GET(code, 1); while (*code == OP_ALT);
1978     c = *code;
1979     continue;
1980 ph10 286 }
1981 ph10 172
1982 ph10 503 /* For a recursion/subroutine call, if its end has been reached, which
1983 ph10 624 implies a backward reference subroutine call, we can scan it. If it's a
1984     forward reference subroutine call, we can't. To detect forward reference
1985     we have to scan up the list that is kept in the workspace. This function is
1986     called only when doing the real compile, not during the pre-compile that
1987     measures the size of the compiled pattern. */
1988 ph10 507
1989 ph10 503 if (c == OP_RECURSE)
1990     {
1991 ph10 624 const uschar *scode;
1992     BOOL empty_branch;
1993    
1994     /* Test for forward reference */
1995    
1996     for (scode = cd->start_workspace; scode < cd->hwm; scode += LINK_SIZE)
1997     if (GET(scode, 0) == code + 1 - cd->start_code) return TRUE;
1998    
1999     /* Not a forward reference, test for completed backward reference */
2000    
2001     empty_branch = FALSE;
2002     scode = cd->start_code + GET(code, 1);
2003 ph10 503 if (GET(scode, 1) == 0) return TRUE; /* Unclosed */
2004 ph10 624
2005     /* Completed backwards reference */
2006    
2007 ph10 503 do
2008     {
2009 ph10 504 if (could_be_empty_branch(scode, endcode, utf8, cd))
2010     {
2011     empty_branch = TRUE;
2012 ph10 507 break;
2013     }
2014 ph10 503 scode += GET(scode, 1);
2015     }
2016     while (*scode == OP_ALT);
2017 ph10 624
2018 ph10 504 if (!empty_branch) return FALSE; /* All branches are non-empty */
2019 ph10 503 continue;
2020 ph10 507 }
2021 ph10 170
2022 ph10 604 /* Groups with zero repeats can of course be empty; skip them. */
2023    
2024     if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO ||
2025     c == OP_BRAPOSZERO)
2026     {
2027     code += _pcre_OP_lengths[c];
2028     do code += GET(code, 1); while (*code == OP_ALT);
2029     c = *code;
2030     continue;
2031     }
2032    
2033     /* A nested group that is already marked as "could be empty" can just be
2034     skipped. */
2035    
2036     if (c == OP_SBRA || c == OP_SBRAPOS ||
2037     c == OP_SCBRA || c == OP_SCBRAPOS)
2038     {
2039     do code += GET(code, 1); while (*code == OP_ALT);
2040     c = *code;
2041     continue;
2042     }
2043    
2044 ph10 170 /* For other groups, scan the branches. */
2045 ph10 172
2046 ph10 604 if (c == OP_BRA || c == OP_BRAPOS ||
2047     c == OP_CBRA || c == OP_CBRAPOS ||
2048     c == OP_ONCE || c == OP_COND)
2049 nigel 77 {
2050     BOOL empty_branch;
2051     if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
2052 ph10 406
2053     /* If a conditional group has only one branch, there is a second, implied,
2054 ph10 395 empty branch, so just skip over the conditional, because it could be empty.
2055     Otherwise, scan the individual branches of the group. */
2056 ph10 406
2057 ph10 395 if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
2058 nigel 77 code += GET(code, 1);
2059 ph10 395 else
2060 ph10 406 {
2061 ph10 395 empty_branch = FALSE;
2062     do
2063     {
2064 ph10 503 if (!empty_branch && could_be_empty_branch(code, endcode, utf8, cd))
2065 ph10 395 empty_branch = TRUE;
2066     code += GET(code, 1);
2067     }
2068     while (*code == OP_ALT);
2069     if (!empty_branch) return FALSE; /* All branches are non-empty */
2070 nigel 77 }
2071 ph10 406
2072 ph10 172 c = *code;
2073 nigel 93 continue;
2074 nigel 77 }
2075    
2076 nigel 93 /* Handle the other opcodes */
2077    
2078     switch (c)
2079 nigel 77 {
2080 ph10 216 /* Check for quantifiers after a class. XCLASS is used for classes that
2081     cannot be represented just by a bit map. This includes negated single
2082     high-valued characters. The length in _pcre_OP_lengths[] is zero; the
2083 ph10 220 actual length is stored in the compiled code, so we must update "code"
2084 ph10 216 here. */
2085 nigel 77
2086     #ifdef SUPPORT_UTF8
2087     case OP_XCLASS:
2088 ph10 216 ccode = code += GET(code, 1);
2089 nigel 77 goto CHECK_CLASS_REPEAT;
2090     #endif
2091    
2092     case OP_CLASS:
2093     case OP_NCLASS:
2094     ccode = code + 33;
2095    
2096     #ifdef SUPPORT_UTF8
2097     CHECK_CLASS_REPEAT:
2098     #endif
2099    
2100     switch (*ccode)
2101     {
2102     case OP_CRSTAR: /* These could be empty; continue */
2103     case OP_CRMINSTAR:
2104     case OP_CRQUERY:
2105     case OP_CRMINQUERY:
2106     break;
2107    
2108     default: /* Non-repeat => class must match */
2109     case OP_CRPLUS: /* These repeats aren't empty */
2110     case OP_CRMINPLUS:
2111     return FALSE;
2112    
2113     case OP_CRRANGE:
2114     case OP_CRMINRANGE:
2115     if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
2116     break;
2117     }
2118     break;
2119    
2120     /* Opcodes that must match a character */
2121    
2122     case OP_PROP:
2123     case OP_NOTPROP:
2124     case OP_EXTUNI:
2125     case OP_NOT_DIGIT:
2126     case OP_DIGIT:
2127     case OP_NOT_WHITESPACE:
2128     case OP_WHITESPACE:
2129     case OP_NOT_WORDCHAR:
2130     case OP_WORDCHAR:
2131     case OP_ANY:
2132 ph10 345 case OP_ALLANY:
2133 nigel 77 case OP_ANYBYTE:
2134     case OP_CHAR:
2135 ph10 602 case OP_CHARI:
2136 nigel 77 case OP_NOT:
2137 ph10 602 case OP_NOTI:
2138 nigel 77 case OP_PLUS:
2139     case OP_MINPLUS:
2140 nigel 93 case OP_POSPLUS:
2141 nigel 77 case OP_EXACT:
2142     case OP_NOTPLUS:
2143     case OP_NOTMINPLUS:
2144 nigel 93 case OP_NOTPOSPLUS:
2145 nigel 77 case OP_NOTEXACT:
2146     case OP_TYPEPLUS:
2147     case OP_TYPEMINPLUS:
2148 nigel 93 case OP_TYPEPOSPLUS:
2149 nigel 77 case OP_TYPEEXACT:
2150     return FALSE;
2151 ph10 227
2152     /* These are going to continue, as they may be empty, but we have to
2153     fudge the length for the \p and \P cases. */
2154    
2155 ph10 224 case OP_TYPESTAR:
2156     case OP_TYPEMINSTAR:
2157     case OP_TYPEPOSSTAR:
2158     case OP_TYPEQUERY:
2159     case OP_TYPEMINQUERY:
2160     case OP_TYPEPOSQUERY:
2161     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2162 ph10 227 break;
2163    
2164 ph10 224 /* Same for these */
2165 ph10 227
2166 ph10 224 case OP_TYPEUPTO:
2167     case OP_TYPEMINUPTO:
2168     case OP_TYPEPOSUPTO:
2169     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
2170     break;
2171 nigel 77
2172     /* End of branch */
2173    
2174     case OP_KET:
2175     case OP_KETRMAX:
2176     case OP_KETRMIN:
2177 ph10 604 case OP_KETRPOS:
2178 nigel 77 case OP_ALT:
2179     return TRUE;
2180    
2181 nigel 93 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
2182     MINUPTO, and POSUPTO may be followed by a multibyte character */
2183 nigel 77
2184     #ifdef SUPPORT_UTF8
2185     case OP_STAR:
2186 ph10 602 case OP_STARI:
2187 nigel 77 case OP_MINSTAR:
2188 ph10 602 case OP_MINSTARI:
2189 nigel 93 case OP_POSSTAR:
2190 ph10 602 case OP_POSSTARI:
2191 nigel 77 case OP_QUERY:
2192 ph10 602 case OP_QUERYI:
2193 nigel 77 case OP_MINQUERY:
2194 ph10 602 case OP_MINQUERYI:
2195 nigel 93 case OP_POSQUERY:
2196 ph10 602 case OP_POSQUERYI:
2197 ph10 426 if (utf8 && code[1] >= 0xc0) code += _pcre_utf8_table4[code[1] & 0x3f];
2198     break;
2199 ph10 461
2200 nigel 77 case OP_UPTO:
2201 ph10 602 case OP_UPTOI:
2202 nigel 77 case OP_MINUPTO:
2203 ph10 602 case OP_MINUPTOI:
2204 nigel 93 case OP_POSUPTO:
2205 ph10 602 case OP_POSUPTOI:
2206 ph10 426 if (utf8 && code[3] >= 0xc0) code += _pcre_utf8_table4[code[3] & 0x3f];
2207 nigel 77 break;
2208     #endif
2209 ph10 503
2210 ph10 510 /* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument
2211     string. */
2212    
2213     case OP_MARK:
2214     case OP_PRUNE_ARG:
2215     case OP_SKIP_ARG:
2216     code += code[1];
2217 ph10 512 break;
2218 ph10 510
2219 ph10 550 case OP_THEN_ARG:
2220     code += code[1+LINK_SIZE];
2221     break;
2222    
2223 ph10 503 /* None of the remaining opcodes are required to match a character. */
2224 ph10 507
2225 ph10 503 default:
2226 ph10 507 break;
2227 nigel 77 }
2228     }
2229    
2230     return TRUE;
2231     }
2232    
2233    
2234    
2235     /*************************************************
2236     * Scan compiled regex for non-emptiness *
2237     *************************************************/
2238    
2239     /* This function is called to check for left recursive calls. We want to check
2240     the current branch of the current pattern to see if it could match the empty
2241     string. If it could, we must look outwards for branches at other levels,
2242     stopping when we pass beyond the bracket which is the subject of the recursion.
2243 ph10 624 This function is called only during the real compile, not during the
2244     pre-compile.
2245 nigel 77
2246     Arguments:
2247     code points to start of the recursion
2248     endcode points to where to stop (current RECURSE item)
2249     bcptr points to the chain of current (unclosed) branch starts
2250     utf8 TRUE if in UTF-8 mode
2251 ph10 507 cd pointers to tables etc
2252 nigel 77
2253     Returns: TRUE if what is matched could be empty
2254     */
2255    
2256     static BOOL
2257     could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
2258 ph10 503 BOOL utf8, compile_data *cd)
2259 nigel 77 {
2260 ph10 475 while (bcptr != NULL && bcptr->current_branch >= code)
2261 nigel 77 {
2262 ph10 503 if (!could_be_empty_branch(bcptr->current_branch, endcode, utf8, cd))
2263 ph10 475 return FALSE;
2264 nigel 77 bcptr = bcptr->outer;
2265     }
2266     return TRUE;
2267     }
2268    
2269    
2270    
2271     /*************************************************
2272     * Check for POSIX class syntax *
2273     *************************************************/
2274    
2275     /* This function is called when the sequence "[:" or "[." or "[=" is
2276 ph10 295 encountered in a character class. It checks whether this is followed by a
2277 ph10 298 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
2278 ph10 295 reach an unescaped ']' without the special preceding character, return FALSE.
2279 nigel 77
2280 ph10 298 Originally, this function only recognized a sequence of letters between the
2281     terminators, but it seems that Perl recognizes any sequence of characters,
2282     though of course unknown POSIX names are subsequently rejected. Perl gives an
2283     "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
2284     didn't consider this to be a POSIX class. Likewise for [:1234:].
2285 ph10 295
2286 ph10 298 The problem in trying to be exactly like Perl is in the handling of escapes. We
2287     have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
2288     class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
2289     below handles the special case of \], but does not try to do any other escape
2290     processing. This makes it different from Perl for cases such as [:l\ower:]
2291 ph10 295 where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
2292 ph10 298 "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
2293 ph10 295 I think.
2294    
2295 ph10 640 A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
2296     It seems that the appearance of a nested POSIX class supersedes an apparent
2297     external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
2298     a digit. Also, unescaped square brackets may also appear as part of class
2299     names. For example, [:a[:abc]b:] gives unknown class "[:abc]b:]" in Perl.
2300    
2301 ph10 295 Arguments:
2302 nigel 77 ptr pointer to the initial [
2303     endptr where to return the end pointer
2304    
2305     Returns: TRUE or FALSE
2306     */
2307    
2308     static BOOL
2309 ph10 295 check_posix_syntax(const uschar *ptr, const uschar **endptr)
2310 nigel 77 {
2311     int terminator; /* Don't combine these lines; the Solaris cc */
2312     terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
2313 ph10 295 for (++ptr; *ptr != 0; ptr++)
2314 nigel 77 {
2315 ph10 640 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2316     ptr++;
2317     else
2318 ph10 298 {
2319 ph10 391 if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2320 ph10 295 {
2321     *endptr = ptr;
2322     return TRUE;
2323 ph10 298 }
2324 ph10 640 if (*ptr == CHAR_LEFT_SQUARE_BRACKET &&
2325     (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
2326     ptr[1] == CHAR_EQUALS_SIGN) &&
2327     check_posix_syntax(ptr, endptr))
2328     return FALSE;
2329 ph10 298 }
2330     }
2331 nigel 77 return FALSE;
2332     }
2333    
2334    
2335    
2336    
2337     /*************************************************
2338     * Check POSIX class name *
2339     *************************************************/
2340    
2341     /* This function is called to check the name given in a POSIX-style class entry
2342     such as [:alnum:].
2343    
2344     Arguments:
2345     ptr points to the first letter
2346     len the length of the name
2347    
2348     Returns: a value representing the name, or -1 if unknown
2349     */
2350    
2351     static int
2352     check_posix_name(const uschar *ptr, int len)
2353     {
2354 ph10 240 const char *pn = posix_names;
2355 nigel 77 register int yield = 0;
2356     while (posix_name_lengths[yield] != 0)
2357     {
2358     if (len == posix_name_lengths[yield] &&
2359 ph10 240 strncmp((const char *)ptr, pn, len) == 0) return yield;
2360 ph10 243 pn += posix_name_lengths[yield] + 1;
2361 nigel 77 yield++;
2362     }
2363     return -1;
2364     }
2365    
2366    
2367     /*************************************************
2368     * Adjust OP_RECURSE items in repeated group *
2369     *************************************************/
2370    
2371     /* OP_RECURSE items contain an offset from the start of the regex to the group
2372     that is referenced. This means that groups can be replicated for fixed
2373     repetition simply by copying (because the recursion is allowed to refer to
2374     earlier groups that are outside the current group). However, when a group is
2375 ph10 335 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
2376     inserted before it, after it has been compiled. This means that any OP_RECURSE
2377     items within it that refer to the group itself or any contained groups have to
2378     have their offsets adjusted. That is one of the jobs of this function. Before it
2379     is called, the partially compiled regex must be temporarily terminated with
2380     OP_END.
2381 nigel 77
2382 nigel 93 This function has been extended with the possibility of forward references for
2383     recursions and subroutine calls. It must also check the list of such references
2384     for the group we are dealing with. If it finds that one of the recursions in
2385     the current group is on this list, it adjusts the offset in the list, not the
2386     value in the reference (which is a group number).
2387    
2388 nigel 77 Arguments:
2389     group points to the start of the group
2390     adjust the amount by which the group is to be moved
2391     utf8 TRUE in UTF-8 mode
2392     cd contains pointers to tables etc.
2393 nigel 93 save_hwm the hwm forward reference pointer at the start of the group
2394 nigel 77
2395     Returns: nothing
2396     */
2397    
2398     static void
2399 nigel 93 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
2400     uschar *save_hwm)
2401 nigel 77 {
2402     uschar *ptr = group;
2403 ph10 224
2404 nigel 77 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
2405     {
2406 nigel 93 int offset;
2407     uschar *hc;
2408    
2409     /* See if this recursion is on the forward reference list. If so, adjust the
2410     reference. */
2411 ph10 345
2412 nigel 93 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
2413     {
2414     offset = GET(hc, 0);
2415     if (cd->start_code + offset == ptr + 1)
2416     {
2417     PUT(hc, 0, offset + adjust);
2418     break;
2419     }
2420     }
2421    
2422     /* Otherwise, adjust the recursion offset if it's after the start of this
2423     group. */
2424    
2425     if (hc >= cd->hwm)
2426     {
2427     offset = GET(ptr, 1);
2428     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
2429     }
2430    
2431 nigel 77 ptr += 1 + LINK_SIZE;
2432     }
2433     }
2434    
2435    
2436    
2437     /*************************************************
2438     * Insert an automatic callout point *
2439     *************************************************/
2440    
2441     /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
2442     callout points before each pattern item.
2443    
2444     Arguments:
2445     code current code pointer
2446     ptr current pattern pointer
2447     cd pointers to tables etc
2448    
2449     Returns: new code pointer
2450     */
2451    
2452     static uschar *
2453     auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
2454     {
2455     *code++ = OP_CALLOUT;
2456     *code++ = 255;
2457 ph10 530 PUT(code, 0, (int)(ptr - cd->start_pattern)); /* Pattern offset */
2458     PUT(code, LINK_SIZE, 0); /* Default length */
2459 nigel 77 return code + 2*LINK_SIZE;
2460     }
2461    
2462    
2463    
2464     /*************************************************
2465     * Complete a callout item *
2466     *************************************************/
2467    
2468     /* A callout item contains the length of the next item in the pattern, which
2469     we can't fill in till after we have reached the relevant point. This is used
2470     for both automatic and manual callouts.
2471    
2472     Arguments:
2473     previous_callout points to previous callout item
2474     ptr current pattern pointer
2475     cd pointers to tables etc
2476    
2477     Returns: nothing
2478     */
2479    
2480     static void
2481     complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
2482     {
2483 ph10 530 int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));
2484 nigel 77 PUT(previous_callout, 2 + LINK_SIZE, length);
2485     }
2486    
2487    
2488    
2489     #ifdef SUPPORT_UCP
2490     /*************************************************
2491     * Get othercase range *
2492     *************************************************/
2493    
2494     /* This function is passed the start and end of a class range, in UTF-8 mode
2495     with UCP support. It searches up the characters, looking for internal ranges of
2496     characters in the "other" case. Each call returns the next one, updating the
2497     start address.
2498    
2499     Arguments:
2500     cptr points to starting character value; updated
2501     d end value
2502     ocptr where to put start of othercase range
2503     odptr where to put end of othercase range
2504    
2505     Yield: TRUE when range returned; FALSE when no more
2506     */
2507    
2508     static BOOL
2509 nigel 93 get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
2510     unsigned int *odptr)
2511 nigel 77 {
2512 nigel 93 unsigned int c, othercase, next;
2513 nigel 77
2514     for (c = *cptr; c <= d; c++)
2515 ph10 349 { if ((othercase = UCD_OTHERCASE(c)) != c) break; }
2516 nigel 77
2517     if (c > d) return FALSE;
2518    
2519     *ocptr = othercase;
2520     next = othercase + 1;
2521    
2522     for (++c; c <= d; c++)
2523     {
2524 ph10 349 if (UCD_OTHERCASE(c) != next) break;
2525 nigel 77 next++;
2526     }
2527    
2528     *odptr = next - 1;
2529     *cptr = c;
2530    
2531     return TRUE;
2532     }
2533 ph10 532
2534    
2535    
2536     /*************************************************
2537     * Check a character and a property *
2538     *************************************************/
2539    
2540     /* This function is called by check_auto_possessive() when a property item
2541     is adjacent to a fixed character.
2542    
2543     Arguments:
2544     c the character
2545     ptype the property type
2546     pdata the data for the type
2547     negated TRUE if it's a negated property (\P or \p{^)
2548 ph10 535
2549 ph10 532 Returns: TRUE if auto-possessifying is OK
2550 ph10 535 */
2551 ph10 532
2552     static BOOL
2553     check_char_prop(int c, int ptype, int pdata, BOOL negated)
2554     {
2555     const ucd_record *prop = GET_UCD(c);
2556     switch(ptype)
2557     {
2558     case PT_LAMP:
2559     return (prop->chartype == ucp_Lu ||
2560     prop->chartype == ucp_Ll ||
2561     prop->chartype == ucp_Lt) == negated;
2562    
2563     case PT_GC:
2564     return (pdata == _pcre_ucp_gentype[prop->chartype]) == negated;
2565    
2566     case PT_PC:
2567     return (pdata == prop->chartype) == negated;
2568    
2569     case PT_SC:
2570     return (pdata == prop->script) == negated;
2571    
2572     /* These are specials */
2573    
2574     case PT_ALNUM:
2575     return (_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2576     _pcre_ucp_gentype[prop->chartype] == ucp_N) == negated;
2577    
2578     case PT_SPACE: /* Perl space */
2579     return (_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2580     c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2581     == negated;
2582    
2583     case PT_PXSPACE: /* POSIX space */
2584     return (_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2585     c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2586     c == CHAR_FF || c == CHAR_CR)
2587     == negated;
2588    
2589     case PT_WORD:
2590     return (_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2591     _pcre_ucp_gentype[prop->chartype] == ucp_N ||
2592     c == CHAR_UNDERSCORE) == negated;
2593     }
2594 ph10 535 return FALSE;
2595 ph10 532 }
2596 nigel 77 #endif /* SUPPORT_UCP */
2597    
2598    
2599 nigel 93
2600 nigel 77 /*************************************************
2601 nigel 93 * Check if auto-possessifying is possible *
2602     *************************************************/
2603    
2604     /* This function is called for unlimited repeats of certain items, to see
2605     whether the next thing could possibly match the repeated item. If not, it makes
2606     sense to automatically possessify the repeated item.
2607    
2608     Arguments:
2609 ph10 532 previous pointer to the repeated opcode
2610 nigel 93 utf8 TRUE in UTF-8 mode
2611     ptr next character in pattern
2612     options options bits
2613     cd contains pointers to tables etc.
2614    
2615     Returns: TRUE if possessifying is wanted
2616     */
2617    
2618     static BOOL
2619 ph10 535 check_auto_possessive(const uschar *previous, BOOL utf8, const uschar *ptr,
2620 ph10 532 int options, compile_data *cd)
2621 nigel 93 {
2622 ph10 532 int c, next;
2623     int op_code = *previous++;
2624 nigel 93
2625     /* Skip whitespace and comments in extended mode */
2626    
2627     if ((options & PCRE_EXTENDED) != 0)
2628     {
2629     for (;;)
2630     {
2631     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2632 ph10 391 if (*ptr == CHAR_NUMBER_SIGN)
2633 nigel 93 {
2634 ph10 579 ptr++;
2635 ph10 556 while (*ptr != 0)
2636     {
2637 nigel 93 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2638 ph10 556 ptr++;
2639 ph10 579 #ifdef SUPPORT_UTF8
2640 ph10 556 if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
2641     #endif
2642     }
2643 nigel 93 }
2644     else break;
2645     }
2646     }
2647    
2648     /* If the next item is one that we can handle, get its value. A non-negative
2649     value is a character, a negative value is an escape value. */
2650    
2651 ph10 391 if (*ptr == CHAR_BACKSLASH)
2652 nigel 93 {
2653     int temperrorcode = 0;
2654     next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2655     if (temperrorcode != 0) return FALSE;
2656     ptr++; /* Point after the escape sequence */
2657     }
2658    
2659     else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2660     {
2661     #ifdef SUPPORT_UTF8
2662     if (utf8) { GETCHARINC(next, ptr); } else
2663     #endif
2664     next = *ptr++;
2665     }
2666    
2667     else return FALSE;
2668    
2669     /* Skip whitespace and comments in extended mode */
2670    
2671     if ((options & PCRE_EXTENDED) != 0)
2672     {
2673     for (;;)
2674     {
2675     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2676 ph10 391 if (*ptr == CHAR_NUMBER_SIGN)
2677 nigel 93 {
2678 ph10 579 ptr++;
2679 ph10 556 while (*ptr != 0)
2680     {
2681 nigel 93 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2682 ph10 556 ptr++;
2683 ph10 579 #ifdef SUPPORT_UTF8
2684 ph10 556 if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
2685     #endif
2686     }
2687 nigel 93 }
2688     else break;
2689     }
2690     }
2691    
2692     /* If the next thing is itself optional, we have to give up. */
2693    
2694 ph10 392 if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
2695 ph10 391 strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
2696     return FALSE;
2697 nigel 93
2698 ph10 532 /* Now compare the next item with the previous opcode. First, handle cases when
2699     the next item is a character. */
2700 nigel 93
2701     if (next >= 0) switch(op_code)
2702     {
2703     case OP_CHAR:
2704 ph10 535 #ifdef SUPPORT_UTF8
2705 ph10 532 GETCHARTEST(c, previous);
2706 ph10 369 #else
2707 ph10 532 c = *previous;
2708 ph10 535 #endif
2709     return c != next;
2710 nigel 93
2711 ph10 602 /* For CHARI (caseless character) we must check the other case. If we have
2712 nigel 93 Unicode property support, we can use it to test the other case of
2713     high-valued characters. */
2714    
2715 ph10 602 case OP_CHARI:
2716 ph10 535 #ifdef SUPPORT_UTF8
2717 ph10 532 GETCHARTEST(c, previous);
2718     #else
2719     c = *previous;
2720 ph10 535 #endif
2721 ph10 532 if (c == next) return FALSE;
2722 nigel 93 #ifdef SUPPORT_UTF8
2723     if (utf8)
2724     {
2725     unsigned int othercase;
2726     if (next < 128) othercase = cd->fcc[next]; else
2727     #ifdef SUPPORT_UCP
2728 ph10 349 othercase = UCD_OTHERCASE((unsigned int)next);
2729 nigel 93 #else
2730     othercase = NOTACHAR;
2731     #endif
2732 ph10 532 return (unsigned int)c != othercase;
2733 nigel 93 }
2734     else
2735     #endif /* SUPPORT_UTF8 */
2736 ph10 532 return (c != cd->fcc[next]); /* Non-UTF-8 mode */
2737 nigel 93
2738 ph10 602 /* For OP_NOT and OP_NOTI, the data is always a single-byte character. These
2739 ph10 604 opcodes are not used for multi-byte characters, because they are coded using
2740 ph10 602 an XCLASS instead. */
2741 nigel 93
2742     case OP_NOT:
2743 ph10 602 return (c = *previous) == next;
2744 ph10 604
2745     case OP_NOTI:
2746 ph10 532 if ((c = *previous) == next) return TRUE;
2747 nigel 93 #ifdef SUPPORT_UTF8
2748     if (utf8)
2749     {
2750     unsigned int othercase;
2751     if (next < 128) othercase = cd->fcc[next]; else
2752     #ifdef SUPPORT_UCP
2753 ph10 349 othercase = UCD_OTHERCASE(next);
2754 nigel 93 #else
2755     othercase = NOTACHAR;
2756     #endif
2757 ph10 532 return (unsigned int)c == othercase;
2758 nigel 93 }
2759     else
2760     #endif /* SUPPORT_UTF8 */
2761 ph10 532 return (c == cd->fcc[next]); /* Non-UTF-8 mode */
2762 nigel 93
2763 ph10 535 /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.
2764     When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
2765    
2766 nigel 93 case OP_DIGIT:
2767     return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2768    
2769     case OP_NOT_DIGIT:
2770     return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2771    
2772     case OP_WHITESPACE:
2773     return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2774    
2775     case OP_NOT_WHITESPACE:
2776     return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2777    
2778     case OP_WORDCHAR:
2779     return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2780    
2781     case OP_NOT_WORDCHAR:
2782     return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2783    
2784 ph10 180 case OP_HSPACE:
2785     case OP_NOT_HSPACE:
2786     switch(next)
2787     {
2788     case 0x09:
2789     case 0x20:
2790     case 0xa0:
2791     case 0x1680:
2792     case 0x180e:
2793     case 0x2000:
2794     case 0x2001:
2795     case 0x2002:
2796     case 0x2003:
2797     case 0x2004:
2798     case 0x2005:
2799     case 0x2006:
2800     case 0x2007:
2801     case 0x2008:
2802     case 0x2009:
2803     case 0x200A:
2804     case 0x202f:
2805     case 0x205f:
2806     case 0x3000:
2807 ph10 528 return op_code == OP_NOT_HSPACE;
2808 ph10 180 default:
2809 ph10 528 return op_code != OP_NOT_HSPACE;
2810 ph10 180 }
2811    
2812 ph10 528 case OP_ANYNL:
2813 ph10 180 case OP_VSPACE:
2814     case OP_NOT_VSPACE:
2815     switch(next)
2816     {
2817     case 0x0a:
2818     case 0x0b:
2819     case 0x0c:
2820     case 0x0d:
2821     case 0x85:
2822     case 0x2028:
2823     case 0x2029:
2824 ph10 528 return op_code == OP_NOT_VSPACE;
2825 ph10 180 default:
2826 ph10 528 return op_code != OP_NOT_VSPACE;
2827 ph10 180 }
2828    
2829 ph10 532 #ifdef SUPPORT_UCP
2830     case OP_PROP:
2831     return check_char_prop(next, previous[0], previous[1], FALSE);
2832 ph10 535
2833 ph10 532 case OP_NOTPROP:
2834     return check_char_prop(next, previous[0], previous[1], TRUE);
2835     #endif
2836    
2837 nigel 93 default:
2838     return FALSE;
2839     }
2840    
2841    
2842 ph10 535 /* Handle the case when the next item is \d, \s, etc. Note that when PCRE_UCP
2843     is set, \d turns into ESC_du rather than ESC_d, etc., so ESC_d etc. are
2844     generated only when PCRE_UCP is *not* set, that is, when only ASCII
2845     characteristics are recognized. Similarly, the opcodes OP_DIGIT etc. are
2846 ph10 532 replaced by OP_PROP codes when PCRE_UCP is set. */
2847 nigel 93
2848     switch(op_code)
2849     {
2850     case OP_CHAR:
2851 ph10 602 case OP_CHARI:
2852 ph10 535 #ifdef SUPPORT_UTF8
2853 ph10 532 GETCHARTEST(c, previous);
2854     #else
2855     c = *previous;
2856 ph10 535 #endif
2857 nigel 93 switch(-next)
2858     {
2859     case ESC_d:
2860 ph10 532 return c > 127 || (cd->ctypes[c] & ctype_digit) == 0;
2861 nigel 93
2862     case ESC_D:
2863 ph10 532 return c <= 127 && (cd->ctypes[c] & ctype_digit) != 0;
2864 nigel 93
2865     case ESC_s:
2866 ph10 532 return c > 127 || (cd->ctypes[c] & ctype_space) == 0;
2867 nigel 93
2868     case ESC_S:
2869 ph10 532 return c <= 127 && (cd->ctypes[c] & ctype_space) != 0;
2870 nigel 93
2871     case ESC_w:
2872 ph10 532 return c > 127 || (cd->ctypes[c] & ctype_word) == 0;
2873 nigel 93
2874     case ESC_W:
2875 ph10 532 return c <= 127 && (cd->ctypes[c] & ctype_word) != 0;
2876 ph10 182
2877 ph10 180 case ESC_h:
2878     case ESC_H:
2879 ph10 532 switch(c)
2880 ph10 180 {
2881     case 0x09:
2882     case 0x20:
2883     case 0xa0:
2884     case 0x1680:
2885     case 0x180e:
2886     case 0x2000:
2887     case 0x2001:
2888     case 0x2002:
2889     case 0x2003:
2890     case 0x2004:
2891     case 0x2005:
2892     case 0x2006:
2893     case 0x2007:
2894     case 0x2008:
2895     case 0x2009:
2896     case 0x200A:
2897     case 0x202f:
2898     case 0x205f:
2899     case 0x3000:
2900     return -next != ESC_h;
2901     default:
2902     return -next == ESC_h;
2903 ph10 182 }
2904    
2905 ph10 180 case ESC_v:
2906     case ESC_V:
2907 ph10 532 switch(c)
2908 ph10 180 {
2909     case 0x0a:
2910     case 0x0b:
2911     case 0x0c:
2912     case 0x0d:
2913     case 0x85:
2914     case 0x2028:
2915     case 0x2029:
2916     return -next != ESC_v;
2917     default:
2918     return -next == ESC_v;
2919 ph10 182 }
2920 ph10 535
2921     /* When PCRE_UCP is set, these values get generated for \d etc. Find
2922     their substitutions and process them. The result will always be either
2923 ph10 532 -ESC_p or -ESC_P. Then fall through to process those values. */
2924 ph10 535
2925 ph10 532 #ifdef SUPPORT_UCP
2926     case ESC_du:
2927     case ESC_DU:
2928     case ESC_wu:
2929     case ESC_WU:
2930     case ESC_su:
2931     case ESC_SU:
2932     {
2933     int temperrorcode = 0;
2934     ptr = substitutes[-next - ESC_DU];
2935     next = check_escape(&ptr, &temperrorcode, 0, options, FALSE);
2936     if (temperrorcode != 0) return FALSE;
2937     ptr++; /* For compatibility */
2938     }
2939 ph10 535 /* Fall through */
2940 nigel 93
2941 ph10 532 case ESC_p:
2942     case ESC_P:
2943     {
2944     int ptype, pdata, errorcodeptr;
2945 ph10 535 BOOL negated;
2946    
2947 ph10 532 ptr--; /* Make ptr point at the p or P */
2948     ptype = get_ucp(&ptr, &negated, &pdata, &errorcodeptr);
2949     if (ptype < 0) return FALSE;
2950     ptr++; /* Point past the final curly ket */
2951 ph10 535
2952 ph10 532 /* If the property item is optional, we have to give up. (When generated
2953     from \d etc by PCRE_UCP, this test will have been applied much earlier,
2954     to the original \d etc. At this point, ptr will point to a zero byte. */
2955 ph10 535
2956 ph10 532 if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
2957     strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
2958     return FALSE;
2959 ph10 535
2960 ph10 532 /* Do the property check. */
2961 ph10 535
2962 ph10 532 return check_char_prop(c, ptype, pdata, (next == -ESC_P) != negated);
2963 ph10 535 }
2964 ph10 532 #endif
2965    
2966 nigel 93 default:
2967     return FALSE;
2968     }
2969    
2970 ph10 535 /* In principle, support for Unicode properties should be integrated here as
2971     well. It means re-organizing the above code so as to get hold of the property
2972     values before switching on the op-code. However, I wonder how many patterns
2973     combine ASCII \d etc with Unicode properties? (Note that if PCRE_UCP is set,
2974     these op-codes are never generated.) */
2975    
2976 nigel 93 case OP_DIGIT:
2977 ph10 180 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2978 ph10 528 next == -ESC_h || next == -ESC_v || next == -ESC_R;
2979 nigel 93
2980     case OP_NOT_DIGIT:
2981     return next == -ESC_d;
2982    
2983     case OP_WHITESPACE:
2984 ph10 528 return next == -ESC_S || next == -ESC_d || next == -ESC_w || next == -ESC_R;
2985 nigel 93
2986     case OP_NOT_WHITESPACE:
2987 ph10 180 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2988 nigel 93
2989 ph10 180 case OP_HSPACE:
2990 ph10 535 return next == -ESC_S || next == -ESC_H || next == -ESC_d ||
2991 ph10 528 next == -ESC_w || next == -ESC_v || next == -ESC_R;
2992 ph10 180
2993     case OP_NOT_HSPACE:
2994     return next == -ESC_h;
2995 ph10 182
2996 ph10 180 /* Can't have \S in here because VT matches \S (Perl anomaly) */
2997 ph10 535 case OP_ANYNL:
2998 ph10 182 case OP_VSPACE:
2999 ph10 180 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
3000    
3001     case OP_NOT_VSPACE:
3002 ph10 528 return next == -ESC_v || next == -ESC_R;
3003 ph10 180
3004 nigel 93 case OP_WORDCHAR:
3005 ph10 535 return next == -ESC_W || next == -ESC_s || next == -ESC_h ||
3006 ph10 528 next == -ESC_v || next == -ESC_R;
3007 nigel 93
3008     case OP_NOT_WORDCHAR:
3009     return next == -ESC_w || next == -ESC_d;
3010 ph10 182
3011 nigel 93 default:
3012     return FALSE;
3013     }
3014    
3015     /* Control does not reach here */
3016     }
3017    
3018    
3019    
3020     /*************************************************
3021 nigel 77 * Compile one branch *
3022     *************************************************/
3023    
3024 nigel 93 /* Scan the pattern, compiling it into a vector. If the options are
3025 nigel 77 changed during the branch, the pointer is used to change the external options
3026 nigel 93 bits. This function is used during the pre-compile phase when we are trying
3027     to find out the amount of memory needed, as well as during the real compile
3028     phase. The value of lengthptr distinguishes the two phases.
3029 nigel 77
3030     Arguments:
3031     optionsptr pointer to the option bits
3032     codeptr points to the pointer to the current code point
3033     ptrptr points to the current pattern pointer
3034     errorcodeptr points to error code variable
3035     firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
3036     reqbyteptr set to the last literal character required, else < 0
3037     bcptr points to current branch chain
3038 ph10 642 cond_depth conditional nesting depth
3039 nigel 77 cd contains pointers to tables etc.
3040 nigel 93 lengthptr NULL during the real compile phase
3041     points to length accumulator during pre-compile phase
3042 nigel 77
3043     Returns: TRUE on success
3044     FALSE, with *errorcodeptr set non-zero on error
3045     */
3046    
3047     static BOOL
3048 nigel 93 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
3049     int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
3050 ph10 642 int cond_depth, compile_data *cd, int *lengthptr)
3051 nigel 77 {
3052     int repeat_type, op_type;
3053     int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
3054     int bravalue = 0;
3055     int greedy_default, greedy_non_default;
3056     int firstbyte, reqbyte;
3057     int zeroreqbyte, zerofirstbyte;
3058     int req_caseopt, reqvary, tempreqvary;
3059 ph10 635 int options = *optionsptr; /* May change dynamically */
3060 nigel 77 int after_manual_callout = 0;
3061 nigel 93 int length_prevgroup = 0;
3062 nigel 77 register int c;
3063     register uschar *code = *codeptr;
3064 nigel 93 uschar *last_code = code;
3065     uschar *orig_code = code;
3066 nigel 77 uschar *tempcode;
3067     BOOL inescq = FALSE;
3068     BOOL groupsetfirstbyte = FALSE;
3069     const uschar *ptr = *ptrptr;
3070     const uschar *tempptr;
3071 ph10 518 const uschar *nestptr = NULL;
3072 nigel 77 uschar *previous = NULL;
3073     uschar *previous_callout = NULL;
3074 nigel 93 uschar *save_hwm = NULL;
3075 nigel 77 uschar classbits[32];
3076    
3077 ph10 635 /* We can fish out the UTF-8 setting once and for all into a BOOL, but we
3078     must not do this for other options (e.g. PCRE_EXTENDED) because they may change
3079     dynamically as we process the pattern. */
3080    
3081 nigel 77 #ifdef SUPPORT_UTF8
3082     BOOL class_utf8;
3083     BOOL utf8 = (options & PCRE_UTF8) != 0;
3084     uschar *class_utf8data;
3085 ph10 300 uschar *class_utf8data_base;
3086 nigel 77 uschar utf8_char[6];
3087     #else
3088     BOOL utf8 = FALSE;
3089 nigel 93 uschar *utf8_char = NULL;
3090 nigel 77 #endif
3091    
3092 ph10 475 #ifdef PCRE_DEBUG
3093 nigel 93 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
3094     #endif
3095    
3096 nigel 77 /* Set up the default and non-default settings for greediness */
3097    
3098     greedy_default = ((options & PCRE_UNGREEDY) != 0);
3099     greedy_non_default = greedy_default ^ 1;
3100    
3101     /* Initialize no first byte, no required byte. REQ_UNSET means "no char
3102     matching encountered yet". It gets changed to REQ_NONE if we hit something that
3103     matches a non-fixed char first char; reqbyte just remains unset if we never
3104     find one.
3105    
3106     When we hit a repeat whose minimum is zero, we may have to adjust these values
3107     to take the zero repeat into account. This is implemented by setting them to
3108     zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
3109     item types that can be repeated set these backoff variables appropriately. */
3110    
3111     firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
3112    
3113     /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
3114     according to the current setting of the caseless flag. REQ_CASELESS is a bit
3115     value > 255. It is added into the firstbyte or reqbyte variables to record the
3116     case status of the value. This is used only for ASCII characters. */
3117    
3118     req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
3119    
3120     /* Switch on next character until the end of the branch */
3121    
3122     for (;; ptr++)
3123     {
3124     BOOL negate_class;
3125 ph10 286 BOOL should_flip_negation;
3126 nigel 77 BOOL possessive_quantifier;
3127     BOOL is_quantifier;
3128 nigel 93 BOOL is_recurse;
3129 ph10 180 BOOL reset_bracount;
3130 nigel 77 int class_charcount;
3131     int class_lastchar;
3132     int newoptions;
3133     int recno;
3134 ph10 172 int refsign;
3135 nigel 77 int skipbytes;
3136     int subreqbyte;
3137     int subfirstbyte;
3138 nigel 93 int terminator;
3139 nigel 77 int mclength;
3140     uschar mcbuffer[8];
3141    
3142 nigel 93 /* Get next byte in the pattern */
3143 nigel 77
3144     c = *ptr;
3145 ph10 345
3146 ph10 535 /* If we are at the end of a nested substitution, revert to the outer level
3147 ph10 518 string. Nesting only happens one level deep. */
3148    
3149     if (c == 0 && nestptr != NULL)
3150     {
3151     ptr = nestptr;
3152     nestptr = NULL;
3153     c = *ptr;
3154     }
3155    
3156 nigel 93 /* If we are in the pre-compile phase, accumulate the length used for the
3157     previous cycle of this loop. */
3158    
3159     if (lengthptr != NULL)
3160     {
3161 ph10 475 #ifdef PCRE_DEBUG
3162 nigel 93 if (code > cd->hwm) cd->hwm = code; /* High water info */
3163     #endif
3164 ph10 505 if (code > cd->start_workspace + WORK_SIZE_CHECK) /* Check for overrun */
3165 nigel 93 {
3166     *errorcodeptr = ERR52;
3167     goto FAILED;
3168     }
3169    
3170     /* There is at least one situation where code goes backwards: this is the
3171     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
3172     the class is simply eliminated. However, it is created first, so we have to
3173     allow memory for it. Therefore, don't ever reduce the length at this point.
3174     */
3175    
3176     if (code < last_code) code = last_code;
3177 ph10 202
3178     /* Paranoid check for integer overflow */
3179    
3180     if (OFLOW_MAX - *lengthptr < code - last_code)
3181     {
3182     *errorcodeptr = ERR20;
3183     goto FAILED;
3184     }
3185    
3186 ph10 530 *lengthptr += (int)(code - last_code);
3187 nigel 93 DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
3188    
3189     /* If "previous" is set and it is not at the start of the work space, move
3190     it back to there, in order to avoid filling up the work space. Otherwise,
3191     if "previous" is NULL, reset the current code pointer to the start. */
3192    
3193     if (previous != NULL)
3194     {
3195     if (previous > orig_code)
3196     {
3197     memmove(orig_code, previous, code - previous);
3198     code -= previous - orig_code;
3199     previous = orig_code;
3200     }
3201     }
3202     else code = orig_code;
3203    
3204     /* Remember where this code item starts so we can pick up the length
3205     next time round. */
3206    
3207     last_code = code;
3208     }
3209    
3210     /* In the real compile phase, just check the workspace used by the forward
3211     reference list. */
3212    
3213 ph10 505 else if (cd->hwm > cd->start_workspace + WORK_SIZE_CHECK)
3214 nigel 93 {
3215     *errorcodeptr = ERR52;
3216     goto FAILED;
3217     }
3218    
3219 nigel 77 /* If in \Q...\E, check for the end; if not, we have a literal */
3220    
3221     if (inescq && c != 0)
3222     {
3223 ph10 391 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
3224 nigel 77 {
3225     inescq = FALSE;
3226     ptr++;
3227     continue;
3228     }
3229     else
3230     {
3231     if (previous_callout != NULL)
3232     {
3233 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
3234     complete_callout(previous_callout, ptr, cd);
3235 nigel 77 previous_callout = NULL;
3236     }
3237     if ((options & PCRE_AUTO_CALLOUT) != 0)
3238     {
3239     previous_callout = code;
3240     code = auto_callout(code, ptr, cd);
3241     }
3242     goto NORMAL_CHAR;
3243     }
3244     }
3245    
3246     /* Fill in length of a previous callout, except when the next thing is
3247     a quantifier. */
3248    
3249 ph10 392 is_quantifier =
3250 ph10 391 c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
3251     (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
3252 nigel 77
3253     if (!is_quantifier && previous_callout != NULL &&
3254     after_manual_callout-- <= 0)
3255     {
3256 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
3257     complete_callout(previous_callout, ptr, cd);
3258 nigel 77 previous_callout = NULL;
3259     }
3260    
3261 ph10 635 /* In extended mode, skip white space and comments. */
3262 nigel 77
3263     if ((options & PCRE_EXTENDED) != 0)
3264     {
3265     if ((cd->ctypes[c] & ctype_space) != 0) continue;
3266 ph10 391 if (c == CHAR_NUMBER_SIGN)
3267 nigel 77 {
3268 ph10 579 ptr++;
3269 ph10 556 while (*ptr != 0)
3270 nigel 91 {
3271 nigel 93 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
3272 ph10 556 ptr++;
3273 ph10 579 #ifdef SUPPORT_UTF8
3274 ph10 556 if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
3275     #endif
3276 nigel 91 }
3277 nigel 93 if (*ptr != 0) continue;
3278    
3279 nigel 91 /* Else fall through to handle end of string */
3280     c = 0;
3281 nigel 77 }
3282     }
3283    
3284     /* No auto callout for quantifiers. */
3285    
3286     if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
3287     {
3288     previous_callout = code;
3289     code = auto_callout(code, ptr, cd);
3290     }
3291    
3292     switch(c)
3293     {
3294 nigel 93 /* ===================================================================*/
3295     case 0: /* The branch terminates at string end */
3296 ph10 391 case CHAR_VERTICAL_LINE: /* or | or ) */
3297     case CHAR_RIGHT_PARENTHESIS:
3298 nigel 77 *firstbyteptr = firstbyte;
3299     *reqbyteptr = reqbyte;
3300     *codeptr = code;
3301     *ptrptr = ptr;
3302 nigel 93 if (lengthptr != NULL)
3303     {
3304 ph10 202 if (OFLOW_MAX - *lengthptr < code - last_code)
3305     {
3306     *errorcodeptr = ERR20;
3307     goto FAILED;
3308     }
3309 ph10 530 *lengthptr += (int)(code - last_code); /* To include callout length */
3310 nigel 93 DPRINTF((">> end branch\n"));
3311     }
3312 nigel 77 return TRUE;
3313    
3314 nigel 93
3315     /* ===================================================================*/
3316 nigel 77 /* Handle single-character metacharacters. In multiline mode, ^ disables
3317     the setting of any following char as a first character. */
3318    
3319 ph10 391 case CHAR_CIRCUMFLEX_ACCENT:
3320 ph10 602 previous = NULL;
3321 nigel 77 if ((options & PCRE_MULTILINE) != 0)
3322     {
3323     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3324 ph10 602 *code++ = OP_CIRCM;
3325 nigel 77 }
3326 ph10 602 else *code++ = OP_CIRC;
3327 nigel 77 break;
3328    
3329 ph10 391 case CHAR_DOLLAR_SIGN:
3330 nigel 77 previous = NULL;
3331 ph10 602 *code++ = ((options & PCRE_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
3332 nigel 77 break;
3333    
3334     /* There can never be a first char if '.' is first, whatever happens about
3335     repeats. The value of reqbyte doesn't change either. */
3336    
3337 ph10 391 case CHAR_DOT:
3338 nigel 77 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3339     zerofirstbyte = firstbyte;
3340     zeroreqbyte = reqbyte;
3341     previous = code;
3342 ph10 342 *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
3343 nigel 77 break;
3344    
3345 nigel 93
3346     /* ===================================================================*/
3347 nigel 87 /* Character classes. If the included characters are all < 256, we build a
3348     32-byte bitmap of the permitted characters, except in the special case
3349     where there is only one such character. For negated classes, we build the
3350     map as usual, then invert it at the end. However, we use a different opcode
3351     so that data characters > 255 can be handled correctly.
3352 nigel 77
3353     If the class contains characters outside the 0-255 range, a different
3354     opcode is compiled. It may optionally have a bit map for characters < 256,
3355     but those above are are explicitly listed afterwards. A flag byte tells
3356     whether the bitmap is present, and whether this is a negated class or not.
3357 ph10 345
3358 ph10 336 In JavaScript compatibility mode, an isolated ']' causes an error. In
3359     default (Perl) mode, it is treated as a data character. */
3360 ph10 345
3361 ph10 391 case CHAR_RIGHT_SQUARE_BRACKET:
3362 ph10 336 if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
3363     {
3364     *errorcodeptr = ERR64;
3365 ph10 345 goto FAILED;
3366 ph10 336 }
3367 ph10 345 goto NORMAL_CHAR;
3368 nigel 77
3369 ph10 391 case CHAR_LEFT_SQUARE_BRACKET:
3370 nigel 77 previous = code;
3371    
3372     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
3373     they are encountered at the top level, so we'll do that too. */
3374    
3375 ph10 392 if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3376 ph10 391 ptr[1] == CHAR_EQUALS_SIGN) &&
3377 ph10 295 check_posix_syntax(ptr, &tempptr))
3378 nigel 77 {
3379 ph10 391 *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
3380 nigel 77 goto FAILED;
3381     }
3382    
3383 ph10 205 /* If the first character is '^', set the negation flag and skip it. Also,
3384 ph10 208 if the first few characters (either before or after ^) are \Q\E or \E we
3385 ph10 205 skip them too. This makes for compatibility with Perl. */
3386 ph10 208
3387 ph10 205 negate_class = FALSE;
3388     for (;;)
3389 nigel 77 {
3390     c = *(++ptr);
3391 ph10 391 if (c == CHAR_BACKSLASH)
3392 ph10 205 {
3393 ph10 392 if (ptr[1] == CHAR_E)
3394 ph10 391 ptr++;
3395 ph10 392 else if (strncmp((const char *)ptr+1,
3396     STR_Q STR_BACKSLASH STR_E, 3) == 0)
3397 ph10 391 ptr += 3;
3398 ph10 392 else
3399 ph10 391 break;
3400 ph10 205 }
3401 ph10 391 else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
3402 ph10 205 negate_class = TRUE;
3403     else break;
3404 ph10 208 }
3405 ph10 345
3406     /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
3407     an initial ']' is taken as a data character -- the code below handles
3408 ph10 341 that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
3409     [^] must match any character, so generate OP_ALLANY. */
3410 ph10 345
3411 ph10 392 if (c == CHAR_RIGHT_SQUARE_BRACKET &&
3412 ph10 391 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
3413 ph10 341 {
3414     *code++ = negate_class? OP_ALLANY : OP_FAIL;
3415     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3416     zerofirstbyte = firstbyte;
3417     break;
3418 ph10 345 }
3419 nigel 77
3420 ph10 286 /* If a class contains a negative special such as \S, we need to flip the
3421     negation flag at the end, so that support for characters > 255 works
3422 ph10 264 correctly (they are all included in the class). */
3423    
3424     should_flip_negation = FALSE;
3425    
3426 nigel 77 /* Keep a count of chars with values < 256 so that we can optimize the case
3427 nigel 93 of just a single character (as long as it's < 256). However, for higher
3428     valued UTF-8 characters, we don't yet do any optimization. */
3429 nigel 77
3430     class_charcount = 0;
3431     class_lastchar = -1;
3432    
3433 nigel 93 /* Initialize the 32-char bit map to all zeros. We build the map in a
3434     temporary bit of memory, in case the class contains only 1 character (less
3435     than 256), because in that case the compiled code doesn't use the bit map.
3436     */
3437    
3438     memset(classbits, 0, 32 * sizeof(uschar));
3439    
3440 nigel 77 #ifdef SUPPORT_UTF8
3441     class_utf8 = FALSE; /* No chars >= 256 */
3442 nigel 93 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
3443 ph10 309 class_utf8data_base = class_utf8data; /* For resetting in pass 1 */
3444 nigel 77 #endif
3445    
3446     /* Process characters until ] is reached. By writing this as a "do" it
3447 nigel 93 means that an initial ] is taken as a data character. At the start of the
3448     loop, c contains the first byte of the character. */
3449 nigel 77
3450 nigel 93 if (c != 0) do
3451 nigel 77 {
3452 nigel 93 const uschar *oldptr;
3453    
3454 nigel 77 #ifdef SUPPORT_UTF8
3455     if (utf8 && c > 127)
3456     { /* Braces are required because the */
3457     GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
3458     }
3459 ph10 535
3460 ph10 300 /* In the pre-compile phase, accumulate the length of any UTF-8 extra
3461 ph10 309 data and reset the pointer. This is so that very large classes that
3462 ph10 300 contain a zillion UTF-8 characters no longer overwrite the work space
3463 ph10 309 (which is on the stack). */
3464    
3465 ph10 300 if (lengthptr != NULL)
3466     {
3467     *lengthptr += class_utf8data - class_utf8data_base;
3468 ph10 309 class_utf8data = class_utf8data_base;
3469     }
3470    
3471 nigel 77 #endif
3472    
3473     /* Inside \Q...\E everything is literal except \E */
3474    
3475     if (inescq)
3476     {
3477 ph10 391 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E) /* If we are at \E */
3478 nigel 77 {
3479 nigel 93 inescq = FALSE; /* Reset literal state */
3480     ptr++; /* Skip the 'E' */
3481     continue; /* Carry on with next */
3482 nigel 77 }
3483 nigel 93 goto CHECK_RANGE; /* Could be range if \E follows */
3484 nigel 77 }
3485    
3486     /* Handle POSIX class names. Perl allows a negation extension of the
3487     form [:^name:]. A square bracket that doesn't match the syntax is
3488     treated as a literal. We also recognize the POSIX constructions
3489     [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
3490     5.6 and 5.8 do. */
3491    
3492 ph10 391 if (c == CHAR_LEFT_SQUARE_BRACKET &&
3493 ph10 392 (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3494 ph10 391 ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
3495 nigel 77 {
3496     BOOL local_negate = FALSE;
3497 nigel 87 int posix_class, taboffset, tabopt;
3498 nigel 77 register const uschar *cbits = cd->cbits;
3499 nigel 87 uschar pbits[32];
3500 nigel 77
3501 ph10 391 if (ptr[1] != CHAR_COLON)
3502 nigel 77 {
3503     *errorcodeptr = ERR31;
3504     goto FAILED;
3505     }
3506    
3507     ptr += 2;
3508 ph10 391 if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
3509 nigel 77 {
3510     local_negate = TRUE;
3511 ph10 286 should_flip_negation = TRUE; /* Note negative special */
3512 nigel 77 ptr++;
3513     }
3514    
3515 ph10 530 posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
3516 nigel 77 if (posix_class < 0)
3517     {
3518     *errorcodeptr = ERR30;
3519     goto FAILED;
3520     }
3521    
3522     /* If matching is caseless, upper and lower are converted to
3523     alpha. This relies on the fact that the class table starts with
3524     alpha, lower, upper as the first 3 entries. */
3525    
3526     if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
3527     posix_class = 0;
3528 ph10 535
3529     /* When PCRE_UCP is set, some of the POSIX classes are converted to
3530 ph10 518 different escape sequences that use Unicode properties. */
3531 ph10 535
3532 ph10 518 #ifdef SUPPORT_UCP
3533     if ((options & PCRE_UCP) != 0)
3534     {
3535     int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);
3536     if (posix_substitutes[pc] != NULL)
3537     {
3538 ph10 535 nestptr = tempptr + 1;
3539 ph10 518 ptr = posix_substitutes[pc] - 1;
3540 ph10 535 continue;
3541     }
3542     }
3543     #endif
3544 ph10 518 /* In the non-UCP case, we build the bit map for the POSIX class in a
3545     chunk of local store because we may be adding and subtracting from it,
3546     and we don't want to subtract bits that may be in the main map already.
3547     At the end we or the result into the bit map that is being built. */
3548 nigel 77
3549     posix_class *= 3;
3550 nigel 87
3551     /* Copy in the first table (always present) */
3552    
3553     memcpy(pbits, cbits + posix_class_maps[posix_class],
3554     32 * sizeof(uschar));
3555    
3556     /* If there is a second table, add or remove it as required. */
3557    
3558     taboffset = posix_class_maps[posix_class + 1];
3559     tabopt = posix_class_maps[posix_class + 2];
3560    
3561     if (taboffset >= 0)
3562 nigel 77 {
3563 nigel 87 if (tabopt >= 0)
3564     for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
3565 nigel 77 else
3566 nigel 87 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
3567 nigel 77 }
3568    
3569 nigel 87 /* Now see if we need to remove any special characters. An option
3570     value of 1 removes vertical space and 2 removes underscore. */
3571    
3572     if (tabopt < 0) tabopt = -tabopt;
3573     if (tabopt == 1) pbits[1] &= ~0x3c;
3574     else if (tabopt == 2) pbits[11] &= 0x7f;
3575    
3576     /* Add the POSIX table or its complement into the main table that is
3577     being built and we are done. */
3578    
3579     if (local_negate)
3580     for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
3581     else
3582     for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
3583    
3584 nigel 77 ptr = tempptr + 1;
3585     class_charcount = 10; /* Set > 1; assumes more than 1 per class */
3586     continue; /* End of POSIX syntax handling */
3587     }
3588    
3589     /* Backslash may introduce a single character, or it may introduce one
3590 nigel 93 of the specials, which just set a flag. The sequence \b is a special
3591 ph10 513 case. Inside a class (and only there) it is treated as backspace. We
3592     assume that other escapes have more than one character in them, so set
3593     class_charcount bigger than one. Unrecognized escapes fall through and
3594     are either treated as literal characters (by default), or are faulted if
3595     PCRE_EXTRA is set. */
3596 nigel 77
3597 ph10 391 if (c == CHAR_BACKSLASH)
3598 nigel 77 {
3599 nigel 93 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3600     if (*errorcodeptr != 0) goto FAILED;
3601 nigel 77
3602 ph10 513 if (-c == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
3603 nigel 77 else if (-c == ESC_Q) /* Handle start of quoted string */
3604     {
3605 ph10 391 if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3606 nigel 77 {
3607     ptr += 2; /* avoid empty string */
3608     }
3609     else inescq = TRUE;
3610     continue;
3611     }
3612 ph10 220 else if (-c == ESC_E) continue; /* Ignore orphan \E */
3613 nigel 77
3614     if (c < 0)
3615     {
3616     register const uschar *cbits = cd->cbits;
3617     class_charcount += 2; /* Greater than 1 is what matters */
3618 nigel 93
3619 ph10 518 switch (-c)
3620 nigel 77 {
3621 ph10 518 #ifdef SUPPORT_UCP
3622     case ESC_du: /* These are the values given for \d etc */
3623     case ESC_DU: /* when PCRE_UCP is set. We replace the */
3624     case ESC_wu: /* escape sequence with an appropriate \p */
3625     case ESC_WU: /* or \P to test Unicode properties instead */
3626     case ESC_su: /* of the default ASCII testing. */
3627     case ESC_SU:
3628     nestptr = ptr;
3629     ptr = substitutes[-c - ESC_DU] - 1; /* Just before substitute */
3630 ph10 535 class_charcount -= 2; /* Undo! */
3631 ph10 518 continue;
3632     #endif
3633 nigel 77 case ESC_d:
3634     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
3635     continue;
3636    
3637     case ESC_D:
3638 ph10 286 should_flip_negation = TRUE;
3639 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
3640     continue;
3641    
3642     case ESC_w:
3643     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
3644     continue;
3645    
3646     case ESC_W:
3647 ph10 286 should_flip_negation = TRUE;
3648 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
3649     continue;
3650    
3651 ph10 552 /* Perl 5.004 onwards omits VT from \s, but we must preserve it
3652 ph10 579 if it was previously set by something earlier in the character
3653     class. */
3654 ph10 552
3655 nigel 77 case ESC_s:
3656 ph10 552 classbits[0] |= cbits[cbit_space];
3657 ph10 579 classbits[1] |= cbits[cbit_space+1] & ~0x08;
3658 ph10 552 for (c = 2; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
3659 nigel 77 continue;
3660    
3661     case ESC_S:
3662 ph10 286 should_flip_negation = TRUE;
3663 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
3664     classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
3665     continue;
3666    
3667 ph10 518 case ESC_h:
3668 ph10 178 SETBIT(classbits, 0x09); /* HT */
3669     SETBIT(classbits, 0x20); /* SPACE */
3670 ph10 180 SETBIT(classbits, 0xa0); /* NBSP */
3671 ph10 178 #ifdef SUPPORT_UTF8
3672     if (utf8)
3673 ph10 180 {
3674 ph10 178 class_utf8 = TRUE;
3675     *class_utf8data++ = XCL_SINGLE;
3676 ph10 180 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
3677 ph10 178 *class_utf8data++ = XCL_SINGLE;
3678 ph10 180 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
3679     *class_utf8data++ = XCL_RANGE;
3680     class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
3681     class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
3682 ph10 178 *class_utf8data++ = XCL_SINGLE;
3683 ph10 180 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
3684 ph10 178 *class_utf8data++ = XCL_SINGLE;
3685 ph10 180 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
3686 ph10 178 *class_utf8data++ = XCL_SINGLE;
3687 ph10 180 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
3688     }
3689     #endif
3690     continue;
3691 nigel 93
3692 ph10 518 case ESC_H:
3693 ph10 178 for (c = 0; c < 32; c++)
3694     {
3695     int x = 0xff;
3696     switch (c)
3697 ph10 180 {
3698 ph10 178 case 0x09/8: x ^= 1 << (0x09%8); break;
3699     case 0x20/8: x ^= 1 << (0x20%8); break;
3700     case 0xa0/8: x ^= 1 << (0xa0%8); break;
3701     default: break;
3702     }
3703     classbits[c] |= x;
3704 ph10 180 }
3705    
3706 ph10 178 #ifdef SUPPORT_UTF8
3707     if (utf8)
3708 ph10 180 {
3709 ph10 178 class_utf8 = TRUE;
3710 ph10 180 *class_utf8data++ = XCL_RANGE;
3711     class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3712     class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
3713     *class_utf8data++ = XCL_RANGE;
3714     class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
3715     class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
3716     *class_utf8data++ = XCL_RANGE;
3717     class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
3718     class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
3719     *class_utf8data++ = XCL_RANGE;
3720     class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
3721     class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
3722     *class_utf8data++ = XCL_RANGE;
3723     class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
3724     class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
3725     *class_utf8data++ = XCL_RANGE;
3726     class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
3727     class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
3728     *class_utf8data++ = XCL_RANGE;
3729     class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
3730     class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3731     }
3732     #endif
3733     continue;
3734 ph10 178
3735 ph10 518 case ESC_v:
3736 ph10 178 SETBIT(classbits, 0x0a); /* LF */
3737     SETBIT(classbits, 0x0b); /* VT */
3738 ph10 180 SETBIT(classbits, 0x0c); /* FF */
3739     SETBIT(classbits, 0x0d); /* CR */
3740     SETBIT(classbits, 0x85); /* NEL */
3741 ph10 178 #ifdef SUPPORT_UTF8
3742     if (utf8)
3743 ph10 180 {
3744 ph10 178 class_utf8 = TRUE;
3745 ph10 180 *class_utf8data++ = XCL_RANGE;
3746     class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
3747     class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3748     }
3749     #endif
3750     continue;
3751 ph10 178
3752 ph10 518 case ESC_V:
3753 ph10 178 for (c = 0; c < 32; c++)
3754     {
3755     int x = 0xff;
3756     switch (c)
3757 ph10 180 {
3758 ph10 178 case 0x0a/8: x ^= 1 << (0x0a%8);
3759     x ^= 1 << (0x0b%8);
3760     x ^= 1 << (0x0c%8);
3761 ph10 180 x ^= 1 << (0x0d%8);
3762 ph10 178 break;
3763     case 0x85/8: x ^= 1 << (0x85%8); break;
3764     default: break;
3765     }
3766     classbits[c] |= x;
3767 ph10 180 }
3768    
3769 ph10 178 #ifdef SUPPORT_UTF8
3770     if (utf8)
3771 ph10 180 {
3772 ph10 178 class_utf8 = TRUE;
3773 ph10 180 *class_utf8data++ = XCL_RANGE;
3774     class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3775     class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
3776     *class_utf8data++ = XCL_RANGE;
3777     class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3778     class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3779     }
3780     #endif
3781     continue;
3782 ph10 178
3783 nigel 77 #ifdef SUPPORT_UCP
3784 ph10 518 case ESC_p:
3785     case ESC_P:
3786     {
3787     BOOL negated;
3788     int pdata;
3789     int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
3790     if (ptype < 0) goto FAILED;
3791     class_utf8 = TRUE;
3792     *class_utf8data++ = ((-c == ESC_p) != negated)?
3793     XCL_PROP : XCL_NOTPROP;
3794     *class_utf8data++ = ptype;
3795     *class_utf8data++ = pdata;
3796     class_charcount -= 2; /* Not a < 256 character */
3797     continue;
3798     }
3799 nigel 77 #endif
3800 ph10 518 /* Unrecognized escapes are faulted if PCRE is running in its
3801     strict mode. By default, for compatibility with Perl, they are
3802     treated as literals. */
3803 nigel 77
3804 ph10 518 default:
3805     if ((options & PCRE_EXTRA) != 0)
3806     {
3807     *errorcodeptr = ERR7;
3808     goto FAILED;
3809     }
3810     class_charcount -= 2; /* Undo the default count from above */
3811     c = *ptr; /* Get the final character and fall through */
3812     break;
3813 nigel 93 }
3814 nigel 77 }
3815    
3816     /* Fall through if we have a single character (c >= 0). This may be
3817 nigel 93 greater than 256 in UTF-8 mode. */
3818 nigel 77
3819     } /* End of backslash handling */
3820    
3821     /* A single character may be followed by '-' to form a range. However,
3822     Perl does not permit ']' to be the end of the range. A '-' character
3823 nigel 93 at the end is treated as a literal. Perl ignores orphaned \E sequences
3824     entirely. The code for handling \Q and \E is messy. */
3825 nigel 77
3826 nigel 93 CHECK_RANGE:
3827 ph10 391 while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3828 nigel 77 {
3829 nigel 93 inescq = FALSE;
3830     ptr += 2;
3831     }
3832    
3833     oldptr = ptr;
3834 ph10 231
3835 ph10 230 /* Remember \r or \n */
3836 ph10 231
3837 ph10 391 if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
3838 ph10 231
3839 ph10 230 /* Check for range */
3840 nigel 93
3841 ph10 391 if (!inescq && ptr[1] == CHAR_MINUS)
3842 nigel 93 {
3843 nigel 77 int d;
3844     ptr += 2;
3845 ph10 391 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
3846 nigel 77
3847 nigel 93 /* If we hit \Q (not followed by \E) at this point, go into escaped
3848     mode. */
3849    
3850 ph10 391 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
3851 nigel 93 {
3852     ptr += 2;
3853 ph10 392 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
3854 ph10 391 { ptr += 2; continue; }
3855 nigel 93 inescq = TRUE;
3856     break;
3857     }
3858    
3859 ph10 391 if (*ptr == 0 || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
3860 nigel 93 {
3861     ptr = oldptr;
3862     goto LONE_SINGLE_CHARACTER;
3863     }
3864    
3865 nigel 77 #ifdef SUPPORT_UTF8
3866     if (utf8)
3867     { /* Braces are required because the */
3868     GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
3869     }
3870     else
3871     #endif
3872     d = *ptr; /* Not UTF-8 mode */
3873    
3874     /* The second part of a range can be a single-character escape, but
3875     not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3876     in such circumstances. */
3877    
3878 ph10 391 if (!inescq && d == CHAR_BACKSLASH)
3879 nigel 77 {
3880 nigel 93 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3881     if (*errorcodeptr != 0) goto FAILED;
3882 nigel 77
3883 ph10 514 /* \b is backspace; any other special means the '-' was literal */
3884 nigel 77
3885     if (d < 0)
3886     {
3887 ph10 514 if (d == -ESC_b) d = CHAR_BS; else
3888 nigel 77 {
3889 nigel 93 ptr = oldptr;
3890 nigel 77 goto LONE_SINGLE_CHARACTER; /* A few lines below */
3891     }
3892     }
3893     }
3894    
3895 nigel 93 /* Check that the two values are in the correct order. Optimize
3896     one-character ranges */
3897 nigel 77
3898 nigel 93 if (d < c)
3899     {
3900     *errorcodeptr = ERR8;
3901     goto FAILED;
3902     }
3903    
3904 nigel 77 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
3905    
3906 ph10 230 /* Remember \r or \n */
3907 ph10 231
3908 ph10 391 if (d == CHAR_CR || d == CHAR_NL) c