/[pcre]/code/branches/pcre16/pcre_compile.c
ViewVC logotype

Contents of /code/branches/pcre16/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 629 - (hide annotations) (download)
Fri Jul 22 09:18:11 2011 UTC (3 years, 3 months ago) by ph10
Original Path: code/trunk/pcre_compile.c
File MIME type: text/plain
File size: 241147 byte(s)
Fix isolated \k bug.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 598 Copyright (c) 1997-2011 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains the external function pcre_compile(), along with
42     supporting internal functions that are not used by other modules. */
43    
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 236 #include "config.h"
47 ph10 200 #endif
48 ph10 199
49 nigel 93 #define NLBLOCK cd /* Block containing newline information */
50     #define PSSTART start_pattern /* Field containing processed string start */
51     #define PSEND end_pattern /* Field containing processed string end */
52    
53 nigel 77 #include "pcre_internal.h"
54    
55    
56 ph10 475 /* When PCRE_DEBUG is defined, we need the pcre_printint() function, which is
57     also used by pcretest. PCRE_DEBUG is not defined when building a production
58     library. */
59 nigel 85
60 ph10 475 #ifdef PCRE_DEBUG
61 nigel 85 #include "pcre_printint.src"
62     #endif
63    
64    
/* Macro for setting individual bits in class bitmaps. "a" is the bitmap (an
array of bytes, or a pointer to one) and "b" is the bit number: bit b lives in
byte b/8 at position b%8. Both arguments and the whole expansion are
parenthesized so that expression arguments such as SETBIT(map + 4, c + 1) expand
with the intended precedence. Note that "b" is evaluated twice, so it must not
have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
68    
69 ph10 202 /* Maximum length value to check against when making sure that the integer that
70     holds the compiled pattern length does not overflow. We make it a bit less than
71     INT_MAX to allow for adding in group terminating bytes, so that we don't have
72     to check them every time. */
73 ph10 178
74 ph10 202 #define OFLOW_MAX (INT_MAX - 20)
75    
76    
77 nigel 77 /*************************************************
78     * Code parameters and static tables *
79     *************************************************/
80    
81 nigel 93 /* This value specifies the size of stack workspace that is used during the
82     first pre-compile phase that determines how much memory is required. The regex
83     is partly compiled into this space, but the compiled parts are discarded as
84     soon as they can be, so that hopefully there will never be an overrun. The code
85     does, however, check for an overrun. The largest amount I've seen used is 218,
86     so this number is very generous.
87 nigel 77
88 nigel 93 The same workspace is used during the second, actual compile phase for
89     remembering forward references to groups so that they can be filled in at the
90     end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
91     is 4 there is plenty of room. */
92 nigel 77
93 nigel 93 #define COMPILE_WORK_SIZE (4096)
94 nigel 77
95 ph10 507 /* The overrun tests check for a slightly smaller size so that they detect the
96 ph10 505 overrun before it actually does run off the end of the data block. */
97 nigel 93
98 ph10 505 #define WORK_SIZE_CHECK (COMPILE_WORK_SIZE - 100)
99    
100    
101 nigel 77 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
102     are simple data values; negative values are for special things like \d and so
103     on. Zero means further processing is needed (for things like \x), or the escape
104     is invalid. */
105    
106 ph10 391 #ifndef EBCDIC
107    
108     /* This is the "normal" table for ASCII systems or for EBCDIC systems running
109 ph10 392 in UTF-8 mode. */
110 ph10 391
111 ph10 392 static const short int escapes[] = {
112 ph10 391 0, 0,
113     0, 0,
114 ph10 392 0, 0,
115     0, 0,
116     0, 0,
117 ph10 391 CHAR_COLON, CHAR_SEMICOLON,
118 ph10 392 CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN,
119 ph10 391 CHAR_GREATER_THAN_SIGN, CHAR_QUESTION_MARK,
120 ph10 392 CHAR_COMMERCIAL_AT, -ESC_A,
121     -ESC_B, -ESC_C,
122     -ESC_D, -ESC_E,
123     0, -ESC_G,
124     -ESC_H, 0,
125     0, -ESC_K,
126 ph10 391 0, 0,
127 ph10 514 -ESC_N, 0,
128 ph10 391 -ESC_P, -ESC_Q,
129     -ESC_R, -ESC_S,
130 ph10 392 0, 0,
131     -ESC_V, -ESC_W,
132     -ESC_X, 0,
133     -ESC_Z, CHAR_LEFT_SQUARE_BRACKET,
134 ph10 391 CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET,
135 ph10 392 CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE,
136 ph10 391 CHAR_GRAVE_ACCENT, 7,
137 ph10 392 -ESC_b, 0,
138     -ESC_d, ESC_e,
139 ph10 391 ESC_f, 0,
140     -ESC_h, 0,
141 ph10 392 0, -ESC_k,
142 ph10 391 0, 0,
143     ESC_n, 0,
144 ph10 392 -ESC_p, 0,
145     ESC_r, -ESC_s,
146 ph10 391 ESC_tee, 0,
147 ph10 392 -ESC_v, -ESC_w,
148     0, 0,
149 ph10 391 -ESC_z
150 nigel 77 };
151    
152 ph10 392 #else
153 ph10 391
154     /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
155    
156 nigel 77 static const short int escapes[] = {
157     /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
158     /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
159     /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
160     /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
161     /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
162     /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
163     /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
164     /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
165 ph10 178 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
166 nigel 93 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
167 nigel 77 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
168 ph10 178 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
169 nigel 77 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
170     /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
171     /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
172     /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
173 ph10 178 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
174 ph10 514 /* D0 */ '}', 0, -ESC_K, 0, 0,-ESC_N, 0, -ESC_P,
175 nigel 93 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
176 ph10 178 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
177 nigel 77 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
178     /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
179     /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
180     };
181     #endif
182    
183    
184 ph10 243 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
185     searched linearly. Put all the names into a single string, in order to reduce
186 ph10 392 the number of relocations when a shared library is dynamically linked. The
187     string is built from string macros so that it works in UTF-8 mode on EBCDIC
188 ph10 391 platforms. */
189 ph10 210
190     typedef struct verbitem {
191 ph10 510 int len; /* Length of verb name */
192     int op; /* Op when no arg, or -1 if arg mandatory */
193     int op_arg; /* Op when arg present, or -1 if not allowed */
194 ph10 211 } verbitem;
195 ph10 210
196 ph10 240 static const char verbnames[] =
197 ph10 510 "\0" /* Empty name is a shorthand for MARK */
198 ph10 512 STRING_MARK0
199 ph10 391 STRING_ACCEPT0
200     STRING_COMMIT0
201     STRING_F0
202     STRING_FAIL0
203     STRING_PRUNE0
204     STRING_SKIP0
205     STRING_THEN;
206 ph10 240
207 ph10 327 static const verbitem verbs[] = {
208 ph10 510 { 0, -1, OP_MARK },
209 ph10 512 { 4, -1, OP_MARK },
210 ph10 510 { 6, OP_ACCEPT, -1 },
211     { 6, OP_COMMIT, -1 },
212     { 1, OP_FAIL, -1 },
213     { 4, OP_FAIL, -1 },
214     { 5, OP_PRUNE, OP_PRUNE_ARG },
215     { 4, OP_SKIP, OP_SKIP_ARG },
216     { 4, OP_THEN, OP_THEN_ARG }
217 ph10 210 };
218    
219 ph10 327 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
220 ph10 210
221    
222 ph10 243 /* Tables of names of POSIX character classes and their lengths. The names are
223     now all in a single string, to reduce the number of relocations when a shared
224 ph10 240 library is dynamically loaded. The list of lengths is terminated by a zero
225     length entry. The first three must be alpha, lower, upper, as this is assumed
226     for handling case independence. */
227 nigel 77
228 ph10 240 static const char posix_names[] =
229 ph10 392 STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
230     STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
231 ph10 391 STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
232     STRING_word0 STRING_xdigit;
233 nigel 77
234     static const uschar posix_name_lengths[] = {
235     5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
236    
237 nigel 87 /* Table of class bit maps for each POSIX class. Each class is formed from a
238     base map, with an optional addition or removal of another map. Then, for some
239     classes, there is some additional tweaking: for [:blank:] the vertical space
240     characters are removed, and for [:alpha:] and [:alnum:] the underscore
241     character is removed. The triples in the table consist of the base map offset,
242     second map offset or -1 if no second map, and a non-negative value for map
243     addition or a negative value for map subtraction (if there are two maps). The
244     absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
245     remove vertical space characters, 2 => remove underscore. */
246 nigel 77
247     static const int posix_class_maps[] = {
248 nigel 87 cbit_word, cbit_digit, -2, /* alpha */
249     cbit_lower, -1, 0, /* lower */
250     cbit_upper, -1, 0, /* upper */
251     cbit_word, -1, 2, /* alnum - word without underscore */
252     cbit_print, cbit_cntrl, 0, /* ascii */
253     cbit_space, -1, 1, /* blank - a GNU extension */
254     cbit_cntrl, -1, 0, /* cntrl */
255     cbit_digit, -1, 0, /* digit */
256     cbit_graph, -1, 0, /* graph */
257     cbit_print, -1, 0, /* print */
258     cbit_punct, -1, 0, /* punct */
259     cbit_space, -1, 0, /* space */
260     cbit_word, -1, 0, /* word - a Perl extension */
261     cbit_xdigit,-1, 0 /* xdigit */
262 nigel 77 };
263    
264 ph10 535 /* Table of substitutes for \d etc when PCRE_UCP is set. The POSIX class
265     substitutes must be in the order of the names, defined above, and there are
266 ph10 518 both positive and negative cases. NULL means no substitute. */
267 nigel 77
268 ph10 518 #ifdef SUPPORT_UCP
269     static const uschar *substitutes[] = {
270     (uschar *)"\\P{Nd}", /* \D */
271     (uschar *)"\\p{Nd}", /* \d */
272     (uschar *)"\\P{Xsp}", /* \S */ /* NOTE: Xsp is Perl space */
273     (uschar *)"\\p{Xsp}", /* \s */
274     (uschar *)"\\P{Xwd}", /* \W */
275 ph10 535 (uschar *)"\\p{Xwd}" /* \w */
276 ph10 518 };
277 ph10 535
278 ph10 518 static const uschar *posix_substitutes[] = {
279     (uschar *)"\\p{L}", /* alpha */
280 ph10 535 (uschar *)"\\p{Ll}", /* lower */
281     (uschar *)"\\p{Lu}", /* upper */
282     (uschar *)"\\p{Xan}", /* alnum */
283 ph10 518 NULL, /* ascii */
284     (uschar *)"\\h", /* blank */
285     NULL, /* cntrl */
286     (uschar *)"\\p{Nd}", /* digit */
287     NULL, /* graph */
288     NULL, /* print */
289     NULL, /* punct */
290     (uschar *)"\\p{Xps}", /* space */ /* NOTE: Xps is POSIX space */
291     (uschar *)"\\p{Xwd}", /* word */
292 ph10 535 NULL, /* xdigit */
293 ph10 518 /* Negated cases */
294     (uschar *)"\\P{L}", /* ^alpha */
295 ph10 535 (uschar *)"\\P{Ll}", /* ^lower */
296     (uschar *)"\\P{Lu}", /* ^upper */
297     (uschar *)"\\P{Xan}", /* ^alnum */
298 ph10 518 NULL, /* ^ascii */
299     (uschar *)"\\H", /* ^blank */
300     NULL, /* ^cntrl */
301     (uschar *)"\\P{Nd}", /* ^digit */
302     NULL, /* ^graph */
303     NULL, /* ^print */
304     NULL, /* ^punct */
305     (uschar *)"\\P{Xps}", /* ^space */ /* NOTE: Xps is POSIX space */
306     (uschar *)"\\P{Xwd}", /* ^word */
307 ph10 535 NULL /* ^xdigit */
308 ph10 518 };
309     #define POSIX_SUBSIZE (sizeof(posix_substitutes)/sizeof(uschar *))
310 ph10 535 #endif
311 ph10 518
312 nigel 93 #define STRING(a) # a
313     #define XSTRING(s) STRING(s)
314    
315 nigel 77 /* The texts of compile-time error messages. These are "char *" because they
316 nigel 93 are passed to the outside world. Do not ever re-use any error number, because
317     they are documented. Always add a new error instead. Messages marked DEAD below
318 ph10 243 are no longer used. This used to be a table of strings, but in order to reduce
319     the number of relocations needed when a shared library is loaded dynamically,
320     it is now one long string. We cannot use a table of offsets, because the
321     lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
322     simply count through to the one we want - this isn't a performance issue
323 ph10 507 because these strings are used only when there is a compilation error.
324 nigel 77
325 ph10 507 Each substring ends with \0 to insert a null character. This includes the final
326     substring, so that the whole string ends with \0\0, which can be detected when
327 ph10 499 counting through. */
328    
329 ph10 240 static const char error_texts[] =
330     "no error\0"
331     "\\ at end of pattern\0"
332     "\\c at end of pattern\0"
333     "unrecognized character follows \\\0"
334     "numbers out of order in {} quantifier\0"
335 nigel 77 /* 5 */
336 ph10 240 "number too big in {} quantifier\0"
337     "missing terminating ] for character class\0"
338     "invalid escape sequence in character class\0"
339     "range out of order in character class\0"
340     "nothing to repeat\0"
341 nigel 77 /* 10 */
342 ph10 240 "operand of unlimited repeat could match the empty string\0" /** DEAD **/
343     "internal error: unexpected repeat\0"
344 ph10 269 "unrecognized character after (? or (?-\0"
345 ph10 240 "POSIX named classes are supported only within a class\0"
346     "missing )\0"
347 nigel 77 /* 15 */
348 ph10 240 "reference to non-existent subpattern\0"
349     "erroffset passed as NULL\0"
350     "unknown option bit(s) set\0"
351     "missing ) after comment\0"
352     "parentheses nested too deeply\0" /** DEAD **/
353 nigel 77 /* 20 */
354 ph10 240 "regular expression is too large\0"
355     "failed to get memory\0"
356     "unmatched parentheses\0"
357     "internal error: code overflow\0"
358     "unrecognized character after (?<\0"
359 nigel 77 /* 25 */
360 ph10 240 "lookbehind assertion is not fixed length\0"
361     "malformed number or name after (?(\0"
362     "conditional group contains more than two branches\0"
363     "assertion expected after (?(\0"
364     "(?R or (?[+-]digits must be followed by )\0"
365 nigel 77 /* 30 */
366 ph10 240 "unknown POSIX class name\0"
367     "POSIX collating elements are not supported\0"
368     "this version of PCRE is not compiled with PCRE_UTF8 support\0"
369     "spare error\0" /** DEAD **/
370     "character value in \\x{...} sequence is too large\0"
371 nigel 77 /* 35 */
372 ph10 240 "invalid condition (?(0)\0"
373     "\\C not allowed in lookbehind assertion\0"
374 ph10 514 "PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0"
375 ph10 240 "number after (?C is > 255\0"
376     "closing ) for (?C expected\0"
377 nigel 77 /* 40 */
378 ph10 240 "recursive call could loop indefinitely\0"
379     "unrecognized character after (?P\0"
380     "syntax error in subpattern name (missing terminator)\0"
381     "two named subpatterns have the same name\0"
382     "invalid UTF-8 string\0"
383 nigel 77 /* 45 */
384 ph10 240 "support for \\P, \\p, and \\X has not been compiled\0"
385     "malformed \\P or \\p sequence\0"
386     "unknown property name after \\P or \\p\0"
387     "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
388     "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
389 nigel 91 /* 50 */
390 ph10 240 "repeated subpattern is too long\0" /** DEAD **/
391     "octal value is greater than \\377 (not in UTF-8 mode)\0"
392     "internal error: overran compiling workspace\0"
393     "internal error: previously-checked referenced subpattern not found\0"
394     "DEFINE group contains more than one branch\0"
395 nigel 93 /* 55 */
396 ph10 240 "repeating a DEFINE group is not allowed\0"
397     "inconsistent NEWLINE options\0"
398 ph10 333 "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
399     "a numbered reference must not be zero\0"
400 ph10 510 "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0"
401 ph10 211 /* 60 */
402 ph10 240 "(*VERB) not recognized\0"
403 ph10 268 "number is too big\0"
404 ph10 272 "subpattern name expected\0"
405 ph10 336 "digit expected after (?+\0"
406 ph10 457 "] is an invalid data character in JavaScript compatibility mode\0"
407     /* 65 */
408 ph10 510 "different names for subpatterns of the same number are not allowed\0"
409 ph10 512 "(*MARK) must have an argument\0"
410 ph10 535 "this version of PCRE is not compiled with PCRE_UCP support\0"
411 ph10 579 "\\c must be followed by an ASCII character\0"
412 ph10 629 "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
413 ph10 510 ;
414 nigel 77
415     /* Table to identify digits and hex digits. This is used when compiling
416     patterns. Note that the tables in chartables are dependent on the locale, and
417     may mark arbitrary characters as digits - but the PCRE compiling code expects
418     to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
419     a private table here. It costs 256 bytes, but it is a lot faster than doing
420     character value tests (at least in some simple cases I timed), and in some
421     applications one wants PCRE to compile efficiently as well as match
422     efficiently.
423    
424     For convenience, we use the same bit definitions as in chartables:
425    
426     0x04 decimal digit
427     0x08 hexadecimal digit
428    
429     Then we can use ctype_digit and ctype_xdigit in the code. */
430    
431 ph10 392 #ifndef EBCDIC
432 ph10 391
433 ph10 392 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
434 ph10 391 UTF-8 mode. */
435    
436 nigel 77 static const unsigned char digitab[] =
437     {
438     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
439     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
440     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
441     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
442     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
443     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
444     0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
445     0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
446     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
447     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
448     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
449     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
450     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
451     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
452     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
453     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
454     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
455     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
456     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
457     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
458     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
459     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
460     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
461     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
462     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
463     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
464     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
465     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
466     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
467     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
468     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
469     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
470    
471 ph10 392 #else
472 ph10 391
473     /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
474    
475 nigel 77 static const unsigned char digitab[] =
476     {
477     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
478     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
479     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
480     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
481     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
482     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
483     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
484     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
485     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
486     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
487     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
488 ph10 97 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
489 nigel 77 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
490     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
491     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
492     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
493     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
494     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
495     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
496     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
497     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
498     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
499     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
500     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
501     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
502     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
503     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
504     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
505     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
506     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
507     0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
508     0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
509    
510     static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
511     0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
512     0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
513     0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
514     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
515     0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
516     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
517     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
518     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
519     0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
520     0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
521     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
522 ph10 97 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
523 nigel 77 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
524     0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
525     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
526     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
527     0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
528     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
529     0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
530     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
531     0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
532     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
533     0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
534     0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
535     0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
536     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
537     0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
538     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
539     0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
540     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
541     0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
542     0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
543     #endif
544    
545    
546     /* Definition to allow mutual recursion */
547    
548     static BOOL
549 ph10 604 compile_regex(int, uschar **, const uschar **, int *, BOOL, BOOL, int, int *,
550     int *, branch_chain *, compile_data *, int *);
551 nigel 77
552    
553    
554     /*************************************************
555 ph10 240 * Find an error text *
556     *************************************************/
557    
558 ph10 243 /* The error texts are now all in one long string, to save on relocations. As
559     some of the text is of unknown length, we can't use a table of offsets.
560     Instead, just count through the strings. This is not a performance issue
561 ph10 240 because it happens only when there has been a compilation error.
562    
563     Argument: the error number
564     Returns: pointer to the error string
565     */
566    
567     static const char *
568     find_error_text(int n)
569     {
570     const char *s = error_texts;
571 ph10 507 for (; n > 0; n--)
572 ph10 499 {
573     while (*s++ != 0) {};
574     if (*s == 0) return "Error text not found (please report)";
575 ph10 507 }
576 ph10 240 return s;
577     }
578    
579    
580     /*************************************************
581 nigel 77 * Handle escapes *
582     *************************************************/
583    
584     /* This function is called when a \ has been encountered. It either returns a
585     positive value for a simple escape such as \n, or a negative value which
586 nigel 93 encodes one of the more complicated things such as \d. A backreference to group
587     n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
588     UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
589     ptr is pointing at the \. On exit, it is on the final character of the escape
590     sequence.
591 nigel 77
592     Arguments:
593     ptrptr points to the pattern position pointer
594     errorcodeptr points to the errorcode variable
595     bracount number of previous extracting brackets
596     options the options bits
597     isclass TRUE if inside a character class
598    
599     Returns: zero or positive => a data character
600     negative => a special escape sequence
601 ph10 213 on error, errorcodeptr is set
602 nigel 77 */
603    
604     static int
605     check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
606     int options, BOOL isclass)
607     {
608 nigel 87 BOOL utf8 = (options & PCRE_UTF8) != 0;
609     const uschar *ptr = *ptrptr + 1;
610 nigel 77 int c, i;
611    
612 nigel 87 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
613     ptr--; /* Set pointer back to the last byte */
614    
615 nigel 77 /* If backslash is at the end of the pattern, it's an error. */
616    
617     if (c == 0) *errorcodeptr = ERR1;
618    
619 ph10 274 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
620     in a table. A non-zero result is something that can be returned immediately.
621 nigel 77 Otherwise further processing may be required. */
622    
623 ph10 391 #ifndef EBCDIC /* ASCII/UTF-8 coding */
624     else if (c < CHAR_0 || c > CHAR_z) {} /* Not alphanumeric */
625     else if ((i = escapes[c - CHAR_0]) != 0) c = i;
626 nigel 77
627 ph10 97 #else /* EBCDIC coding */
628 ph10 274 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphanumeric */
629 nigel 77 else if ((i = escapes[c - 0x48]) != 0) c = i;
630     #endif
631    
632     /* Escapes that need further processing, or are illegal. */
633    
634     else
635     {
636     const uschar *oldptr;
637 nigel 93 BOOL braced, negated;
638    
639 nigel 77 switch (c)
640     {
641     /* A number of Perl escapes are not handled by PCRE. We give an explicit
642     error. */
643    
644 ph10 391 case CHAR_l:
645     case CHAR_L:
646     case CHAR_u:
647     case CHAR_U:
648 nigel 77 *errorcodeptr = ERR37;
649     break;
650    
651 ph10 333 /* \g must be followed by one of a number of specific things:
652 ph10 345
653 ph10 333 (1) A number, either plain or braced. If positive, it is an absolute
654     backreference. If negative, it is a relative backreference. This is a Perl
655     5.10 feature.
656 ph10 345
657 ph10 333 (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
658     is part of Perl's movement towards a unified syntax for back references. As
659     this is synonymous with \k{name}, we fudge it up by pretending it really
660     was \k.
661 ph10 345
662     (3) For Oniguruma compatibility we also support \g followed by a name or a
663     number either in angle brackets or in single quotes. However, these are
664     (possibly recursive) subroutine calls, _not_ backreferences. Just return
665 ph10 333 the -ESC_g code (cf \k). */
666 nigel 93
667 ph10 391 case CHAR_g:
668     if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
669 ph10 333 {
670     c = -ESC_g;
671 ph10 345 break;
672     }
673 ph10 333
674     /* Handle the Perl-compatible cases */
675 ph10 345
676 ph10 391 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
677 nigel 93 {
678 ph10 171 const uschar *p;
679 ph10 391 for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
680     if (*p != CHAR_MINUS && (digitab[*p] & ctype_digit) == 0) break;
681     if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)
682 ph10 171 {
683     c = -ESC_k;
684     break;
685 ph10 172 }
686 nigel 93 braced = TRUE;
687     ptr++;
688     }
689     else braced = FALSE;
690    
691 ph10 391 if (ptr[1] == CHAR_MINUS)
692 nigel 93 {
693     negated = TRUE;
694     ptr++;
695     }
696     else negated = FALSE;
697    
698     c = 0;
699     while ((digitab[ptr[1]] & ctype_digit) != 0)
700 ph10 391 c = c * 10 + *(++ptr) - CHAR_0;
701 ph10 220
702 ph10 333 if (c < 0) /* Integer overflow */
703 ph10 213 {
704     *errorcodeptr = ERR61;
705     break;
706 ph10 220 }
707 ph10 345
708 ph10 391 if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
709 nigel 93 {
710     *errorcodeptr = ERR57;
711 ph10 213 break;
712 nigel 93 }
713 ph10 345
714 ph10 333 if (c == 0)
715     {
716     *errorcodeptr = ERR58;
717     break;
718 ph10 345 }
719 nigel 93
720     if (negated)
721     {
722     if (c > bracount)
723     {
724     *errorcodeptr = ERR15;
725 ph10 213 break;
726 nigel 93 }
727     c = bracount - (c - 1);
728     }
729    
730     c = -(ESC_REF + c);
731     break;
732    
733 nigel 77 /* The handling of escape sequences consisting of a string of digits
734     starting with one that is not zero is not straightforward. By experiment,
735     the way Perl works seems to be as follows:
736    
737     Outside a character class, the digits are read as a decimal number. If the
738     number is less than 10, or if there are that many previous extracting
739     left brackets, then it is a back reference. Otherwise, up to three octal
740     digits are read to form an escaped byte. Thus \123 is likely to be octal
741     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
742     value is greater than 377, the least significant 8 bits are taken. Inside a
743     character class, \ followed by a digit is always an octal number. */
744    
745 ph10 391 case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
746     case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
747 nigel 77
748     if (!isclass)
749     {
750     oldptr = ptr;
751 ph10 391 c -= CHAR_0;
752 nigel 77 while ((digitab[ptr[1]] & ctype_digit) != 0)
753 ph10 391 c = c * 10 + *(++ptr) - CHAR_0;
754 ph10 333 if (c < 0) /* Integer overflow */
755 ph10 213 {
756     *errorcodeptr = ERR61;
757 ph10 220 break;
758     }
759 nigel 77 if (c < 10 || c <= bracount)
760     {
761     c = -(ESC_REF + c);
762     break;
763     }
764     ptr = oldptr; /* Put the pointer back and fall through */
765     }
766    
767     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
768     generates a binary zero byte and treats the digit as a following literal.
769     Thus we have to pull back the pointer by one. */
770    
771 ph10 391 if ((c = *ptr) >= CHAR_8)
772 nigel 77 {
773     ptr--;
774     c = 0;
775     break;
776     }
777    
778     /* \0 always starts an octal number, but we may drop through to here with a
779 nigel 91 larger first octal digit. The original code used just to take the least
780     significant 8 bits of octal numbers (I think this is what early Perls used
781     to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
782     than 3 octal digits. */
783 nigel 77
784 ph10 391 case CHAR_0:
785     c -= CHAR_0;
786     while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
787     c = c * 8 + *(++ptr) - CHAR_0;
788 nigel 91 if (!utf8 && c > 255) *errorcodeptr = ERR51;
789 nigel 77 break;
790    
791 nigel 87 /* \x is complicated. \x{ddd} is a character number which can be greater
792     than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
793     treated as a data character. */
794 nigel 77
795 ph10 391 case CHAR_x:
796     if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
797 nigel 77 {
798     const uschar *pt = ptr + 2;
799 nigel 87 int count = 0;
800    
801 nigel 77 c = 0;
802     while ((digitab[*pt] & ctype_xdigit) != 0)
803     {
804 nigel 87 register int cc = *pt++;
805 ph10 391 if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
806 nigel 77 count++;
807 nigel 87
808 ph10 391 #ifndef EBCDIC /* ASCII/UTF-8 coding */
809     if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
810     c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
811 ph10 97 #else /* EBCDIC coding */
812 ph10 391 if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
813     c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
814 nigel 77 #endif
815     }
816 nigel 87
817 ph10 391 if (*pt == CHAR_RIGHT_CURLY_BRACKET)
818 nigel 77 {
819 nigel 87 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
820 nigel 77 ptr = pt;
821     break;
822     }
823 nigel 87
824 nigel 77 /* If the sequence of hex digits does not end with '}', then we don't
825     recognize this construct; fall through to the normal \x handling. */
826     }
827    
828 nigel 87 /* Read just a single-byte hex-defined char */
829 nigel 77
830     c = 0;
831     while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
832     {
833 ph10 391 int cc; /* Some compilers don't like */
834     cc = *(++ptr); /* ++ in initializers */
835     #ifndef EBCDIC /* ASCII/UTF-8 coding */
836     if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
837     c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
838 ph10 97 #else /* EBCDIC coding */
839 ph10 391 if (cc <= CHAR_z) cc += 64; /* Convert to upper case */
840     c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
841 nigel 77 #endif
842     }
843     break;
844    
845 nigel 93 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
846 ph10 574 An error is given if the byte following \c is not an ASCII character. This
847     coding is ASCII-specific, but then the whole concept of \cx is
848 nigel 93 ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
849 nigel 77
850 ph10 391 case CHAR_c:
851 nigel 77 c = *(++ptr);
852     if (c == 0)
853     {
854     *errorcodeptr = ERR2;
855 ph10 213 break;
856 nigel 77 }
857 ph10 574 #ifndef EBCDIC /* ASCII/UTF-8 coding */
858     if (c > 127) /* Excludes all non-ASCII in either mode */
859     {
860     *errorcodeptr = ERR68;
861 ph10 579 break;
862     }
863 ph10 391 if (c >= CHAR_a && c <= CHAR_z) c -= 32;
864 nigel 77 c ^= 0x40;
865 ph10 574 #else /* EBCDIC coding */
866 ph10 391 if (c >= CHAR_a && c <= CHAR_z) c += 64;
867 nigel 77 c ^= 0xC0;
868     #endif
869     break;
870    
871     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
872 ph10 274 other alphanumeric following \ is an error if PCRE_EXTRA was set;
873     otherwise, for Perl compatibility, it is a literal. This code looks a bit
874     odd, but there used to be some cases other than the default, and there may
875     be again in future, so I haven't "optimized" it. */
876 nigel 77
877     default:
878     if ((options & PCRE_EXTRA) != 0) switch(c)
879     {
880     default:
881     *errorcodeptr = ERR3;
882     break;
883     }
884     break;
885     }
886     }
887 ph10 518
888     /* Perl supports \N{name} for character names, as well as plain \N for "not
889 ph10 514 newline". PCRE does not support \N{name}. */
890 nigel 77
891 ph10 514 if (c == -ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET)
892 ph10 518 *errorcodeptr = ERR37;
893 ph10 514
894 ph10 518 /* If PCRE_UCP is set, we change the values for \d etc. */
895    
896     if ((options & PCRE_UCP) != 0 && c <= -ESC_D && c >= -ESC_w)
897     c -= (ESC_DU - ESC_D);
898    
899     /* Set the pointer to the final character before returning. */
900    
901 nigel 77 *ptrptr = ptr;
902     return c;
903     }
904    
905    
906    
907     #ifdef SUPPORT_UCP
908     /*************************************************
909     * Handle \P and \p *
910     *************************************************/
911    
912     /* This function is called after \P or \p has been encountered, provided that
913     PCRE is compiled with support for Unicode properties. On entry, ptrptr is
914     pointing at the P or p. On exit, it is pointing at the final character of the
915     escape sequence.
916    
917     Argument:
918     ptrptr points to the pattern position pointer
919     negptr points to a boolean that is set TRUE for negation else FALSE
920 nigel 87 dptr points to an int that is set to the detailed property value
921 nigel 77 errorcodeptr points to the error code variable
922    
923 nigel 87 Returns: type value from ucp_type_table, or -1 for an invalid type
924 nigel 77 */
925    
926     static int
927 nigel 87 get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
928 nigel 77 {
929     int c, i, bot, top;
930     const uschar *ptr = *ptrptr;
931 nigel 87 char name[32];
932 nigel 77
933     c = *(++ptr);
934     if (c == 0) goto ERROR_RETURN;
935    
936     *negptr = FALSE;
937    
938 nigel 87 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
939     negation. */
940 nigel 77
941 ph10 391 if (c == CHAR_LEFT_CURLY_BRACKET)
942 nigel 77 {
943 ph10 391 if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
944 nigel 77 {
945     *negptr = TRUE;
946     ptr++;
947     }
948 ph10 199 for (i = 0; i < (int)sizeof(name) - 1; i++)
949 nigel 77 {
950     c = *(++ptr);
951     if (c == 0) goto ERROR_RETURN;
952 ph10 391 if (c == CHAR_RIGHT_CURLY_BRACKET) break;
953 nigel 77 name[i] = c;
954     }
955 ph10 391 if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
956 nigel 77 name[i] = 0;
957     }
958    
959     /* Otherwise there is just one following character */
960    
961     else
962     {
963     name[0] = c;
964     name[1] = 0;
965     }
966    
967     *ptrptr = ptr;
968    
969     /* Search for a recognized property name using binary chop */
970    
971     bot = 0;
972     top = _pcre_utt_size;
973    
974     while (bot < top)
975     {
976 nigel 87 i = (bot + top) >> 1;
977 ph10 240 c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);
978 nigel 87 if (c == 0)
979     {
980     *dptr = _pcre_utt[i].value;
981     return _pcre_utt[i].type;
982     }
983 nigel 77 if (c > 0) bot = i + 1; else top = i;
984     }
985    
986     *errorcodeptr = ERR47;
987     *ptrptr = ptr;
988     return -1;
989    
990     ERROR_RETURN:
991     *errorcodeptr = ERR46;
992     *ptrptr = ptr;
993     return -1;
994     }
995     #endif
996    
997    
998    
999    
1000     /*************************************************
1001     * Check for counted repeat *
1002     *************************************************/
1003    
1004     /* This function is called when a '{' is encountered in a place where it might
1005     start a quantifier. It looks ahead to see if it really is a quantifier or not.
1006     It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
1007     where the ddds are digits.
1008    
1009     Arguments:
1010     p pointer to the first char after '{'
1011    
1012     Returns: TRUE or FALSE
1013     */
1014    
1015     static BOOL
1016     is_counted_repeat(const uschar *p)
1017     {
1018     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
1019     while ((digitab[*p] & ctype_digit) != 0) p++;
1020 ph10 391 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
1021 nigel 77
1022 ph10 391 if (*p++ != CHAR_COMMA) return FALSE;
1023     if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
1024 nigel 77
1025     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
1026     while ((digitab[*p] & ctype_digit) != 0) p++;
1027    
1028 ph10 391 return (*p == CHAR_RIGHT_CURLY_BRACKET);
1029 nigel 77 }
1030    
1031    
1032    
1033     /*************************************************
1034     * Read repeat counts *
1035     *************************************************/
1036    
1037     /* Read an item of the form {n,m} and return the values. This is called only
1038     after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1039     so the syntax is guaranteed to be correct, but we need to check the values.
1040    
1041     Arguments:
1042     p pointer to first char after '{'
1043     minp pointer to int for min
1044     maxp pointer to int for max
1045     returned as -1 if no max
1046     errorcodeptr points to error code variable
1047    
1048     Returns: pointer to '}' on success;
1049     current ptr on error, with errorcodeptr set non-zero
1050     */
1051    
1052     static const uschar *
1053     read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
1054     {
1055     int min = 0;
1056     int max = -1;
1057    
1058 nigel 81 /* Read the minimum value and do a paranoid check: a negative value indicates
1059     an integer overflow. */
1060    
1061 ph10 391 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - CHAR_0;
1062 nigel 81 if (min < 0 || min > 65535)
1063     {
1064     *errorcodeptr = ERR5;
1065     return p;
1066     }
1067 nigel 77
1068 nigel 81 /* Read the maximum value if there is one, and again do a paranoid on its size.
1069     Also, max must not be less than min. */
1070    
1071 ph10 391 if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
1072 nigel 77 {
1073 ph10 391 if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
1074 nigel 77 {
1075     max = 0;
1076 ph10 391 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - CHAR_0;
1077 nigel 81 if (max < 0 || max > 65535)
1078     {
1079     *errorcodeptr = ERR5;
1080     return p;
1081     }
1082 nigel 77 if (max < min)
1083     {
1084     *errorcodeptr = ERR4;
1085     return p;
1086     }
1087     }
1088     }
1089    
1090 nigel 81 /* Fill in the required variables, and pass back the pointer to the terminating
1091     '}'. */
1092 nigel 77
1093 nigel 81 *minp = min;
1094     *maxp = max;
1095 nigel 77 return p;
1096     }
1097    
1098    
1099    
1100     /*************************************************
1101 ph10 408 * Subroutine for finding forward reference *
1102 nigel 91 *************************************************/
1103    
1104 ph10 408 /* This recursive function is called only from find_parens() below. The
1105     top-level call starts at the beginning of the pattern. All other calls must
1106     start at a parenthesis. It scans along a pattern's text looking for capturing
1107 nigel 93 subpatterns, and counting them. If it finds a named pattern that matches the
1108     name it is given, it returns its number. Alternatively, if the name is NULL, it
1109 ph10 578 returns when it reaches a given numbered subpattern. Recursion is used to keep
1110     track of subpatterns that reset the capturing group numbers - the (?| feature.
1111 nigel 91
1112 ph10 578 This function was originally called only from the second pass, in which we know
1113     that if (?< or (?' or (?P< is encountered, the name will be correctly
1114     terminated because that is checked in the first pass. There is now one call to
1115     this function in the first pass, to check for a recursive back reference by
1116     name (so that we can make the whole group atomic). In this case, we need check
1117 ph10 579 only up to the current position in the pattern, and that is still OK because
1118     and previous occurrences will have been checked. To make this work, the test
1119     for "end of pattern" is a check against cd->end_pattern in the main loop,
1120 ph10 578 instead of looking for a binary zero. This means that the special first-pass
1121 ph10 579 call can adjust cd->end_pattern temporarily. (Checks for binary zero while
1122     processing items within the loop are OK, because afterwards the main loop will
1123 ph10 578 terminate.)
1124    
1125 nigel 91 Arguments:
1126 ph10 408 ptrptr address of the current character pointer (updated)
1127 ph10 345 cd compile background data
1128 nigel 93 name name to seek, or NULL if seeking a numbered subpattern
1129     lorn name length, or subpattern number if name is NULL
1130     xmode TRUE if we are in /x mode
1131 ph10 579 utf8 TRUE if we are in UTF-8 mode
1132 ph10 411 count pointer to the current capturing subpattern number (updated)
1133 nigel 91
1134     Returns: the number of the named subpattern, or -1 if not found
1135     */
1136    
1137     static int
1138 ph10 408 find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn,
1139 ph10 556 BOOL xmode, BOOL utf8, int *count)
1140 nigel 91 {
1141 ph10 408 uschar *ptr = *ptrptr;
1142     int start_count = *count;
1143     int hwm_count = start_count;
1144     BOOL dup_parens = FALSE;
1145 nigel 93
1146 ph10 411 /* If the first character is a parenthesis, check on the type of group we are
1147 ph10 408 dealing with. The very first call may not start with a parenthesis. */
1148    
1149     if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1150     {
1151 ph10 544 /* Handle specials such as (*SKIP) or (*UTF8) etc. */
1152 ph10 545
1153 ph10 544 if (ptr[1] == CHAR_ASTERISK) ptr += 2;
1154 ph10 545
1155 ph10 544 /* Handle a normal, unnamed capturing parenthesis. */
1156 ph10 408
1157 ph10 544 else if (ptr[1] != CHAR_QUESTION_MARK)
1158 ph10 408 {
1159     *count += 1;
1160     if (name == NULL && *count == lorn) return *count;
1161 ph10 411 ptr++;
1162 ph10 408 }
1163    
1164 ph10 544 /* All cases now have (? at the start. Remember when we are in a group
1165     where the parenthesis numbers are duplicated. */
1166    
1167     else if (ptr[2] == CHAR_VERTICAL_LINE)
1168     {
1169     ptr += 3;
1170     dup_parens = TRUE;
1171     }
1172 ph10 545
1173 ph10 544 /* Handle comments; all characters are allowed until a ket is reached. */
1174    
1175     else if (ptr[2] == CHAR_NUMBER_SIGN)
1176     {
1177     for (ptr += 3; *ptr != 0; ptr++) if (*ptr == CHAR_RIGHT_PARENTHESIS) break;
1178     goto FAIL_EXIT;
1179 ph10 545 }
1180 ph10 544
1181 ph10 408 /* Handle a condition. If it is an assertion, just carry on so that it
1182     is processed as normal. If not, skip to the closing parenthesis of the
1183 ph10 544 condition (there can't be any nested parens). */
1184 ph10 411
1185 ph10 408 else if (ptr[2] == CHAR_LEFT_PARENTHESIS)
1186     {
1187 ph10 411 ptr += 2;
1188 ph10 408 if (ptr[1] != CHAR_QUESTION_MARK)
1189     {
1190     while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
1191 ph10 411 if (*ptr != 0) ptr++;
1192 ph10 408 }
1193 ph10 411 }
1194    
1195 ph10 544 /* Start with (? but not a condition. */
1196 ph10 408
1197     else
1198 ph10 411 {
1199 ph10 408 ptr += 2;
1200     if (*ptr == CHAR_P) ptr++; /* Allow optional P */
1201    
1202     /* We have to disambiguate (?<! and (?<= from (?<name> for named groups */
1203 ph10 411
1204 ph10 408 if ((*ptr == CHAR_LESS_THAN_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK &&
1205     ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)
1206     {
1207     int term;
1208     const uschar *thisname;
1209     *count += 1;
1210     if (name == NULL && *count == lorn) return *count;
1211     term = *ptr++;
1212     if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;
1213     thisname = ptr;
1214     while (*ptr != term) ptr++;
1215     if (name != NULL && lorn == ptr - thisname &&
1216     strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1217     return *count;
1218 ph10 461 term++;
1219 ph10 411 }
1220 ph10 408 }
1221 ph10 411 }
1222 ph10 408
1223 ph10 411 /* Past any initial parenthesis handling, scan for parentheses or vertical
1224 ph10 579 bars. Stop if we get to cd->end_pattern. Note that this is important for the
1225     first-pass call when this value is temporarily adjusted to stop at the current
1226 ph10 578 position. So DO NOT change this to a test for binary zero. */
1227 ph10 408
1228 ph10 578 for (; ptr < cd->end_pattern; ptr++)
1229 nigel 91 {
1230 nigel 93 /* Skip over backslashed characters and also entire \Q...\E */
1231    
1232 ph10 391 if (*ptr == CHAR_BACKSLASH)
1233 nigel 93 {
1234 ph10 408 if (*(++ptr) == 0) goto FAIL_EXIT;
1235 ph10 391 if (*ptr == CHAR_Q) for (;;)
1236 nigel 93 {
1237 ph10 391 while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1238 ph10 408 if (*ptr == 0) goto FAIL_EXIT;
1239 ph10 391 if (*(++ptr) == CHAR_E) break;
1240 nigel 93 }
1241     continue;
1242     }
1243    
1244 ph10 340 /* Skip over character classes; this logic must be similar to the way they
1245     are handled for real. If the first character is '^', skip it. Also, if the
1246     first few characters (either before or after ^) are \Q\E or \E we skip them
1247 ph10 392 too. This makes for compatibility with Perl. Note the use of STR macros to
1248 ph10 391 encode "Q\\E" so that it works in UTF-8 on EBCDIC platforms. */
1249 nigel 93
1250 ph10 391 if (*ptr == CHAR_LEFT_SQUARE_BRACKET)
1251 nigel 93 {
1252 ph10 340 BOOL negate_class = FALSE;
1253     for (;;)
1254     {
1255 ph10 438 if (ptr[1] == CHAR_BACKSLASH)
1256 ph10 340 {
1257 ph10 438 if (ptr[2] == CHAR_E)
1258     ptr+= 2;
1259     else if (strncmp((const char *)ptr+2,
1260 ph10 392 STR_Q STR_BACKSLASH STR_E, 3) == 0)
1261 ph10 438 ptr += 4;
1262 ph10 392 else
1263 ph10 391 break;
1264 ph10 340 }
1265 ph10 438 else if (!negate_class && ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1266 ph10 461 {
1267 ph10 340 negate_class = TRUE;
1268 ph10 438 ptr++;
1269 ph10 461 }
1270 ph10 340 else break;
1271     }
1272    
1273     /* If the next character is ']', it is a data character that must be
1274 ph10 341 skipped, except in JavaScript compatibility mode. */
1275 ph10 345
1276 ph10 392 if (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET &&
1277 ph10 391 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
1278 ph10 345 ptr++;
1279    
1280 ph10 391 while (*(++ptr) != CHAR_RIGHT_SQUARE_BRACKET)
1281 nigel 93 {
1282 ph10 220 if (*ptr == 0) return -1;
1283 ph10 391 if (*ptr == CHAR_BACKSLASH)
1284 nigel 93 {
1285 ph10 408 if (*(++ptr) == 0) goto FAIL_EXIT;
1286 ph10 391 if (*ptr == CHAR_Q) for (;;)
1287 nigel 93 {
1288 ph10 391 while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1289 ph10 408 if (*ptr == 0) goto FAIL_EXIT;
1290 ph10 391 if (*(++ptr) == CHAR_E) break;
1291 nigel 93 }
1292     continue;
1293     }
1294     }
1295     continue;
1296     }
1297    
1298     /* Skip comments in /x mode */
1299    
1300 ph10 391 if (xmode && *ptr == CHAR_NUMBER_SIGN)
1301 nigel 93 {
1302 ph10 579 ptr++;
1303 ph10 556 while (*ptr != 0)
1304     {
1305     if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
1306     ptr++;
1307 ph10 579 #ifdef SUPPORT_UTF8
1308 ph10 556 if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
1309     #endif
1310     }
1311 ph10 408 if (*ptr == 0) goto FAIL_EXIT;
1312 nigel 93 continue;
1313     }
1314    
1315 ph10 408 /* Check for the special metacharacters */
1316 ph10 411
1317 ph10 408 if (*ptr == CHAR_LEFT_PARENTHESIS)
1318 nigel 93 {
1319 ph10 556 int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, count);
1320 ph10 408 if (rc > 0) return rc;
1321     if (*ptr == 0) goto FAIL_EXIT;
1322 nigel 93 }
1323 ph10 411
1324 ph10 408 else if (*ptr == CHAR_RIGHT_PARENTHESIS)
1325     {
1326     if (dup_parens && *count < hwm_count) *count = hwm_count;
1327 ph10 545 goto FAIL_EXIT;
1328 ph10 408 }
1329 ph10 411
1330     else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)
1331 ph10 408 {
1332     if (*count > hwm_count) hwm_count = *count;
1333     *count = start_count;
1334 ph10 411 }
1335 ph10 408 }
1336 nigel 93
1337 ph10 408 FAIL_EXIT:
1338     *ptrptr = ptr;
1339     return -1;
1340     }
1341 nigel 93
1342    
1343    
1344    
1345 ph10 408 /*************************************************
1346     * Find forward referenced subpattern *
1347     *************************************************/
1348 nigel 93
1349 ph10 408 /* This function scans along a pattern's text looking for capturing
1350     subpatterns, and counting them. If it finds a named pattern that matches the
1351     name it is given, it returns its number. Alternatively, if the name is NULL, it
1352     returns when it reaches a given numbered subpattern. This is used for forward
1353     references to subpatterns. We used to be able to start this scan from the
1354     current compiling point, using the current count value from cd->bracount, and
1355     do it all in a single loop, but the addition of the possibility of duplicate
1356     subpattern numbers means that we have to scan from the very start, in order to
1357     take account of such duplicates, and to use a recursive function to keep track
1358     of the different types of group.
1359    
1360     Arguments:
1361     cd compile background data
1362     name name to seek, or NULL if seeking a numbered subpattern
1363     lorn name length, or subpattern number if name is NULL
1364     xmode TRUE if we are in /x mode
1365 ph10 579 utf8 TRUE if we are in UTF-8 mode
1366 ph10 408
1367     Returns: the number of the found subpattern, or -1 if not found
1368     */
1369    
1370     static int
1371 ph10 556 find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode,
1372     BOOL utf8)
1373 ph10 408 {
1374     uschar *ptr = (uschar *)cd->start_pattern;
1375     int count = 0;
1376     int rc;
1377    
1378     /* If the pattern does not start with an opening parenthesis, the first call
1379     to find_parens_sub() will scan right to the end (if necessary). However, if it
1380     does start with a parenthesis, find_parens_sub() will return when it hits the
1381     matching closing parens. That is why we have to have a loop. */
1382    
1383 ph10 411 for (;;)
1384     {
1385 ph10 556 rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, &count);
1386 ph10 411 if (rc > 0 || *ptr++ == 0) break;
1387     }
1388    
1389 ph10 408 return rc;
1390 nigel 91 }
1391    
1392    
1393    
1394 ph10 408
1395 nigel 91 /*************************************************
1396 nigel 77 * Find first significant op code *
1397     *************************************************/
1398    
1399     /* This is called by several functions that scan a compiled expression looking
1400     for a fixed first character, or an anchoring op code etc. It skips over things
1401 ph10 602 that do not influence this. For some calls, it makes sense to skip negative
1402     forward and all backward assertions, and also the \b assertion; for others it
1403     does not.
1404 nigel 77
1405     Arguments:
1406     code pointer to the start of the group
1407     skipassert TRUE if certain assertions are to be skipped
1408    
1409     Returns: pointer to the first significant opcode
1410     */
1411    
1412     static const uschar*
1413 ph10 604 first_significant_code(const uschar *code, BOOL skipassert)
1414 nigel 77 {
1415     for (;;)
1416     {
1417     switch ((int)*code)
1418     {
1419     case OP_ASSERT_NOT:
1420     case OP_ASSERTBACK:
1421     case OP_ASSERTBACK_NOT:
1422     if (!skipassert) return code;
1423     do code += GET(code, 1); while (*code == OP_ALT);
1424     code += _pcre_OP_lengths[*code];
1425     break;
1426    
1427     case OP_WORD_BOUNDARY:
1428     case OP_NOT_WORD_BOUNDARY:
1429     if (!skipassert) return code;
1430     /* Fall through */
1431    
1432     case OP_CALLOUT:
1433     case OP_CREF:
1434 ph10 459 case OP_NCREF:
1435 nigel 93 case OP_RREF:
1436 ph10 459 case OP_NRREF:
1437 nigel 93 case OP_DEF:
1438 nigel 77 code += _pcre_OP_lengths[*code];
1439     break;
1440    
1441     default:
1442     return code;
1443     }
1444     }
1445     /* Control never reaches here */
1446     }
1447    
1448    
1449    
1450    
1451     /*************************************************
1452 ph10 454 * Find the fixed length of a branch *
1453 nigel 77 *************************************************/
1454    
1455 ph10 454 /* Scan a branch and compute the fixed length of subject that will match it,
1456 nigel 77 if the length is fixed. This is needed for dealing with backward assertions.
1457 ph10 461 In UTF8 mode, the result is in characters rather than bytes. The branch is
1458 ph10 454 temporarily terminated with OP_END when this function is called.
1459 nigel 77
1460 ph10 461 This function is called when a backward assertion is encountered, so that if it
1461     fails, the error message can point to the correct place in the pattern.
1462 ph10 454 However, we cannot do this when the assertion contains subroutine calls,
1463 ph10 461 because they can be forward references. We solve this by remembering this case
1464 ph10 454 and doing the check at the end; a flag specifies which mode we are running in.
1465    
1466 nigel 77 Arguments:
1467     code points to the start of the pattern (the bracket)
1468 ph10 604 utf8 TRUE in UTF-8 mode
1469 ph10 461 atend TRUE if called when the pattern is complete
1470     cd the "compile data" structure
1471 nigel 77
1472 ph10 461 Returns: the fixed length,
1473 ph10 454 or -1 if there is no fixed length,
1474 nigel 77 or -2 if \C was encountered
1475 ph10 454 or -3 if an OP_RECURSE item was encountered and atend is FALSE
1476 nigel 77 */
1477    
1478     static int
1479 ph10 604 find_fixedlength(uschar *code, BOOL utf8, BOOL atend, compile_data *cd)
1480 nigel 77 {
1481     int length = -1;
1482    
1483     register int branchlength = 0;
1484     register uschar *cc = code + 1 + LINK_SIZE;
1485    
1486     /* Scan along the opcodes for this branch. If we get to the end of the
1487     branch, check the length against that of the other branches. */
1488    
1489     for (;;)
1490     {
1491     int d;
1492 ph10 454 uschar *ce, *cs;
1493 nigel 77 register int op = *cc;
1494     switch (op)
1495     {
1496 ph10 604 /* We only need to continue for OP_CBRA (normal capturing bracket) and
1497     OP_BRA (normal non-capturing bracket) because the other variants of these
1498     opcodes are all concerned with unlimited repeated groups, which of course
1499     are not of fixed length. They will cause a -1 response from the default
1500     case of this switch. */
1501    
1502 nigel 93 case OP_CBRA:
1503 nigel 77 case OP_BRA:
1504     case OP_ONCE:
1505     case OP_COND:
1506 ph10 604 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), utf8, atend, cd);
1507 nigel 77 if (d < 0) return d;
1508     branchlength += d;
1509     do cc += GET(cc, 1); while (*cc == OP_ALT);
1510     cc += 1 + LINK_SIZE;
1511     break;
1512    
1513     /* Reached end of a branch; if it's a ket it is the end of a nested
1514     call. If it's ALT it is an alternation in a nested call. If it is
1515 ph10 604 END it's the end of the outer call. All can be handled by the same code.
1516     Note that we must not include the OP_KETRxxx opcodes here, because they
1517     all imply an unlimited repeat. */
1518 nigel 77
1519     case OP_ALT:
1520     case OP_KET:
1521     case OP_END:
1522     if (length < 0) length = branchlength;
1523     else if (length != branchlength) return -1;
1524     if (*cc != OP_ALT) return length;
1525     cc += 1 + LINK_SIZE;
1526     branchlength = 0;
1527     break;
1528 ph10 461
1529 ph10 454 /* A true recursion implies not fixed length, but a subroutine call may
1530     be OK. If the subroutine is a forward reference, we can't deal with
1531     it until the end of the pattern, so return -3. */
1532 ph10 461
1533 ph10 454 case OP_RECURSE:
1534     if (!atend) return -3;
1535     cs = ce = (uschar *)cd->start_code + GET(cc, 1); /* Start subpattern */
1536     do ce += GET(ce, 1); while (*ce == OP_ALT); /* End subpattern */
1537     if (cc > cs && cc < ce) return -1; /* Recursion */
1538 ph10 604 d = find_fixedlength(cs + 2, utf8, atend, cd);
1539 ph10 461 if (d < 0) return d;
1540 ph10 454 branchlength += d;
1541     cc += 1 + LINK_SIZE;
1542 ph10 461 break;
1543 nigel 77
1544     /* Skip over assertive subpatterns */
1545    
1546     case OP_ASSERT:
1547     case OP_ASSERT_NOT:
1548     case OP_ASSERTBACK:
1549     case OP_ASSERTBACK_NOT:
1550     do cc += GET(cc, 1); while (*cc == OP_ALT);
1551     /* Fall through */
1552    
1553     /* Skip over things that don't match chars */
1554    
1555     case OP_REVERSE:
1556     case OP_CREF:
1557 ph10 459 case OP_NCREF:
1558 nigel 93 case OP_RREF:
1559 ph10 459 case OP_NRREF:
1560 nigel 93 case OP_DEF:
1561 nigel 77 case OP_CALLOUT:
1562     case OP_SOD:
1563     case OP_SOM:
1564 ph10 500 case OP_SET_SOM:
1565 nigel 77 case OP_EOD:
1566     case OP_EODN:
1567     case OP_CIRC:
1568 ph10 602 case OP_CIRCM:
1569 nigel 77 case OP_DOLL:
1570 ph10 602 case OP_DOLLM:
1571 nigel 77 case OP_NOT_WORD_BOUNDARY:
1572     case OP_WORD_BOUNDARY:
1573     cc += _pcre_OP_lengths[*cc];
1574     break;
1575    
1576     /* Handle literal characters */
1577    
1578     case OP_CHAR:
1579 ph10 602 case OP_CHARI:
1580 nigel 91 case OP_NOT:
1581 ph10 604 case OP_NOTI:
1582 nigel 77 branchlength++;
1583     cc += 2;
1584     #ifdef SUPPORT_UTF8
1585 ph10 604 if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
1586 nigel 77 #endif
1587     break;
1588    
1589     /* Handle exact repetitions. The count is already in characters, but we
1590     need to skip over a multibyte character in UTF8 mode. */
1591    
1592     case OP_EXACT:
1593     branchlength += GET2(cc,1);
1594     cc += 4;
1595     #ifdef SUPPORT_UTF8
1596 ph10 604 if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
1597 nigel 77 #endif
1598     break;
1599    
1600     case OP_TYPEEXACT:
1601     branchlength += GET2(cc,1);
1602 ph10 220 if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1603 nigel 77 cc += 4;
1604     break;
1605    
1606     /* Handle single-char matchers */
1607    
1608     case OP_PROP:
1609     case OP_NOTPROP:
1610 nigel 87 cc += 2;
1611 nigel 77 /* Fall through */
1612    
1613     case OP_NOT_DIGIT:
1614     case OP_DIGIT:
1615     case OP_NOT_WHITESPACE:
1616     case OP_WHITESPACE:
1617     case OP_NOT_WORDCHAR:
1618     case OP_WORDCHAR:
1619     case OP_ANY:
1620 ph10 342 case OP_ALLANY:
1621 nigel 77 branchlength++;
1622     cc++;
1623     break;
1624    
1625     /* The single-byte matcher isn't allowed */
1626    
1627     case OP_ANYBYTE:
1628     return -2;
1629    
1630     /* Check a class for variable quantification */
1631    
1632     #ifdef SUPPORT_UTF8
1633     case OP_XCLASS:
1634     cc += GET(cc, 1) - 33;
1635     /* Fall through */
1636     #endif
1637    
1638     case OP_CLASS:
1639     case OP_NCLASS:
1640     cc += 33;
1641    
1642     switch (*cc)
1643     {
1644     case OP_CRSTAR:
1645     case OP_CRMINSTAR:
1646     case OP_CRQUERY:
1647     case OP_CRMINQUERY:
1648     return -1;
1649    
1650     case OP_CRRANGE:
1651     case OP_CRMINRANGE:
1652     if (GET2(cc,1) != GET2(cc,3)) return -1;
1653     branchlength += GET2(cc,1);
1654     cc += 5;
1655     break;
1656    
1657     default:
1658     branchlength++;
1659     }
1660     break;
1661    
1662     /* Anything else is variable length */
1663    
1664     default:
1665     return -1;
1666     }
1667     }
1668     /* Control never gets here */
1669     }
1670    
1671    
1672    
1673    
1674     /*************************************************
1675 ph10 454 * Scan compiled regex for specific bracket *
1676 nigel 77 *************************************************/
1677    
1678     /* This little function scans through a compiled pattern until it finds a
1679 ph10 454 capturing bracket with the given number, or, if the number is negative, an
1680 ph10 461 instance of OP_REVERSE for a lookbehind. The function is global in the C sense
1681     so that it can be called from pcre_study() when finding the minimum matching
1682 ph10 455 length.
1683 nigel 77
1684     Arguments:
1685     code points to start of expression
1686     utf8 TRUE in UTF-8 mode
1687 ph10 454 number the required bracket number or negative to find a lookbehind
1688 nigel 77
1689     Returns: pointer to the opcode for the bracket, or NULL if not found
1690     */
1691    
1692 ph10 455 const uschar *
1693     _pcre_find_bracket(const uschar *code, BOOL utf8, int number)
1694 nigel 77 {
1695     for (;;)
1696     {
1697     register int c = *code;
1698 ph10 618
1699 nigel 77 if (c == OP_END) return NULL;
1700 nigel 91
1701     /* XCLASS is used for classes that cannot be represented just by a bit
1702     map. This includes negated single high-valued characters. The length in
1703     the table is zero; the actual length is stored in the compiled code. */
1704    
1705     if (c == OP_XCLASS) code += GET(code, 1);
1706 ph10 461
1707 ph10 454 /* Handle recursion */
1708 ph10 461
1709 ph10 454 else if (c == OP_REVERSE)
1710     {
1711 ph10 461 if (number < 0) return (uschar *)code;
1712 ph10 454 code += _pcre_OP_lengths[c];
1713     }
1714 nigel 91
1715 nigel 93 /* Handle capturing bracket */
1716 nigel 91
1717 ph10 604 else if (c == OP_CBRA || c == OP_SCBRA ||
1718     c == OP_CBRAPOS || c == OP_SCBRAPOS)
1719 nigel 77 {
1720 nigel 93 int n = GET2(code, 1+LINK_SIZE);
1721 nigel 77 if (n == number) return (uschar *)code;
1722 nigel 93 code += _pcre_OP_lengths[c];
1723 nigel 77 }
1724 nigel 91
1725 ph10 220 /* Otherwise, we can get the item's length from the table, except that for
1726     repeated character types, we have to test for \p and \P, which have an extra
1727 ph10 512 two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
1728 ph10 510 must add in its length. */
1729 nigel 91
1730 nigel 77 else
1731     {
1732 ph10 218 switch(c)
1733     {
1734     case OP_TYPESTAR:
1735     case OP_TYPEMINSTAR:
1736     case OP_TYPEPLUS:
1737     case OP_TYPEMINPLUS:
1738     case OP_TYPEQUERY:
1739     case OP_TYPEMINQUERY:
1740     case OP_TYPEPOSSTAR:
1741     case OP_TYPEPOSPLUS:
1742     case OP_TYPEPOSQUERY:
1743     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1744 ph10 220 break;
1745 ph10 221
1746     case OP_TYPEUPTO:
1747     case OP_TYPEMINUPTO:
1748     case OP_TYPEEXACT:
1749     case OP_TYPEPOSUPTO:
1750     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1751     break;
1752 ph10 512
1753 ph10 510 case OP_MARK:
1754     case OP_PRUNE_ARG:
1755     case OP_SKIP_ARG:
1756     code += code[1];
1757 ph10 512 break;
1758 ph10 550
1759     case OP_THEN_ARG:
1760     code += code[1+LINK_SIZE];
1761     break;
1762 ph10 220 }
1763    
1764 ph10 218 /* Add in the fixed length from the table */
1765 ph10 220
1766 nigel 77 code += _pcre_OP_lengths[c];
1767 ph10 220
1768 ph10 218 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1769     a multi-byte character. The length in the table is a minimum, so we have to
1770     arrange to skip the extra bytes. */
1771 ph10 220
1772 ph10 107 #ifdef SUPPORT_UTF8
1773 nigel 77 if (utf8) switch(c)
1774     {
1775     case OP_CHAR:
1776 ph10 602 case OP_CHARI:
1777 nigel 77 case OP_EXACT:
1778 ph10 602 case OP_EXACTI:
1779 nigel 77 case OP_UPTO:
1780 ph10 602 case OP_UPTOI:
1781 nigel 77 case OP_MINUPTO:
1782 ph10 602 case OP_MINUPTOI:
1783 nigel 93 case OP_POSUPTO:
1784 ph10 602 case OP_POSUPTOI:
1785 nigel 77 case OP_STAR:
1786 ph10 602 case OP_STARI:
1787 nigel 77 case OP_MINSTAR:
1788 ph10 602 case OP_MINSTARI:
1789 nigel 93 case OP_POSSTAR:
1790 ph10 602 case OP_POSSTARI:
1791 nigel 77 case OP_PLUS:
1792 ph10 602 case OP_PLUSI:
1793 nigel 77 case OP_MINPLUS:
1794 ph10 602 case OP_MINPLUSI:
1795 nigel 93 case OP_POSPLUS:
1796 ph10 602 case OP_POSPLUSI:
1797 nigel 77 case OP_QUERY:
1798 ph10 602 case OP_QUERYI:
1799 nigel 77 case OP_MINQUERY:
1800 ph10 602 case OP_MINQUERYI:
1801 nigel 93 case OP_POSQUERY:
1802 ph10 602 case OP_POSQUERYI:
1803 nigel 93 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1804 nigel 77 break;
1805     }
1806 ph10 369 #else
1807     (void)(utf8); /* Keep compiler happy by referencing function argument */
1808 ph10 111 #endif
1809 nigel 77 }
1810     }
1811     }
1812    
1813    
1814    
1815     /*************************************************
1816     * Scan compiled regex for recursion reference *
1817     *************************************************/
1818    
1819     /* This little function scans through a compiled pattern until it finds an
1820     instance of OP_RECURSE.
1821    
1822     Arguments:
1823     code points to start of expression
1824     utf8 TRUE in UTF-8 mode
1825    
1826     Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1827     */
1828    
1829     static const uschar *
1830     find_recurse(const uschar *code, BOOL utf8)
1831     {
1832     for (;;)
1833     {
1834     register int c = *code;
1835     if (c == OP_END) return NULL;
1836 nigel 91 if (c == OP_RECURSE) return code;
1837 ph10 220
1838 nigel 91 /* XCLASS is used for classes that cannot be represented just by a bit
1839     map. This includes negated single high-valued characters. The length in
1840     the table is zero; the actual length is stored in the compiled code. */
1841    
1842     if (c == OP_XCLASS) code += GET(code, 1);
1843    
1844 ph10 220 /* Otherwise, we can get the item's length from the table, except that for
1845     repeated character types, we have to test for \p and \P, which have an extra
1846 ph10 512 two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
1847 ph10 510 must add in its length. */
1848 nigel 91
1849 nigel 77 else
1850     {
1851 ph10 218 switch(c)
1852     {
1853     case OP_TYPESTAR:
1854     case OP_TYPEMINSTAR:
1855     case OP_TYPEPLUS:
1856     case OP_TYPEMINPLUS:
1857     case OP_TYPEQUERY:
1858     case OP_TYPEMINQUERY:
1859     case OP_TYPEPOSSTAR:
1860     case OP_TYPEPOSPLUS:
1861     case OP_TYPEPOSQUERY:
1862     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1863 ph10 220 break;
1864 ph10 221
1865     case OP_TYPEPOSUPTO:
1866     case OP_TYPEUPTO:
1867     case OP_TYPEMINUPTO:
1868     case OP_TYPEEXACT:
1869     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1870     break;
1871 ph10 512
1872 ph10 510 case OP_MARK:
1873     case OP_PRUNE_ARG:
1874     case OP_SKIP_ARG:
1875     code += code[1];
1876 ph10 512 break;
1877 ph10 550
1878     case OP_THEN_ARG:
1879     code += code[1+LINK_SIZE];
1880     break;
1881 ph10 220 }
1882    
1883 ph10 218 /* Add in the fixed length from the table */
1884    
1885 nigel 77 code += _pcre_OP_lengths[c];
1886 ph10 220
1887 ph10 218 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1888     by a multi-byte character. The length in the table is a minimum, so we have
1889     to arrange to skip the extra bytes. */
1890 ph10 220
1891 ph10 107 #ifdef SUPPORT_UTF8
1892 nigel 77 if (utf8) switch(c)
1893     {
1894     case OP_CHAR:
1895 ph10 602 case OP_CHARI:
1896 nigel 77 case OP_EXACT:
1897 ph10 602 case OP_EXACTI:
1898 nigel 77 case OP_UPTO:
1899 ph10 602 case OP_UPTOI:
1900 nigel 77 case OP_MINUPTO:
1901 ph10 602 case OP_MINUPTOI:
1902 nigel 93 case OP_POSUPTO:
1903 ph10 602 case OP_POSUPTOI:
1904 nigel 77 case OP_STAR:
1905 ph10 602 case OP_STARI:
1906 nigel 77 case OP_MINSTAR:
1907 ph10 602 case OP_MINSTARI:
1908 nigel 93 case OP_POSSTAR:
1909 ph10 602 case OP_POSSTARI:
1910 nigel 77 case OP_PLUS:
1911 ph10 602 case OP_PLUSI:
1912 nigel 77 case OP_MINPLUS:
1913 ph10 602 case OP_MINPLUSI:
1914 nigel 93 case OP_POSPLUS:
1915 ph10 602 case OP_POSPLUSI:
1916 nigel 77 case OP_QUERY:
1917 ph10 602 case OP_QUERYI:
1918 nigel 77 case OP_MINQUERY:
1919 ph10 602 case OP_MINQUERYI:
1920 nigel 93 case OP_POSQUERY:
1921 ph10 602 case OP_POSQUERYI:
1922 nigel 93 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1923 nigel 77 break;
1924     }
1925 ph10 369 #else
1926     (void)(utf8); /* Keep compiler happy by referencing function argument */
1927 ph10 111 #endif
1928 nigel 77 }
1929     }
1930     }
1931    
1932    
1933    
1934     /*************************************************
1935     * Scan compiled branch for non-emptiness *
1936     *************************************************/
1937    
1938     /* This function scans through a branch of a compiled pattern to see whether it
1939 nigel 93 can match the empty string or not. It is called from could_be_empty()
1940     below and from compile_branch() when checking for an unlimited repeat of a
1941     group that can match nothing. Note that first_significant_code() skips over
1942 ph10 282 backward and negative forward assertions when its final argument is TRUE. If we
1943     hit an unclosed bracket, we return "empty" - this means we've struck an inner
1944     bracket whose current branch will already have been scanned.
1945 nigel 77
1946     Arguments:
1947     code points to start of search
1948     endcode points to where to stop
1949     utf8 TRUE if in UTF8 mode
1950 ph10 503 cd contains pointers to tables etc.
1951 nigel 77
1952     Returns: TRUE if what is matched could be empty
1953     */
1954    
1955     static BOOL
1956 ph10 503 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8,
1957     compile_data *cd)
1958 nigel 77 {
1959     register int c;
1960 ph10 604 for (code = first_significant_code(code + _pcre_OP_lengths[*code], TRUE);
1961 nigel 77 code < endcode;
1962 ph10 604 code = first_significant_code(code + _pcre_OP_lengths[c], TRUE))
1963 nigel 77 {
1964     const uschar *ccode;
1965    
1966     c = *code;
1967 ph10 507
1968 ph10 286 /* Skip over forward assertions; the other assertions are skipped by
1969 ph10 282 first_significant_code() with a TRUE final argument. */
1970 ph10 286
1971 ph10 282 if (c == OP_ASSERT)
1972 ph10 286 {
1973 ph10 282 do code += GET(code, 1); while (*code == OP_ALT);
1974     c = *code;
1975     continue;
1976 ph10 286 }
1977 ph10 172
1978 ph10 503 /* For a recursion/subroutine call, if its end has been reached, which
1979 ph10 624 implies a backward reference subroutine call, we can scan it. If it's a
1980     forward reference subroutine call, we can't. To detect forward reference
1981     we have to scan up the list that is kept in the workspace. This function is
1982     called only when doing the real compile, not during the pre-compile that
1983     measures the size of the compiled pattern. */
1984 ph10 507
1985 ph10 503 if (c == OP_RECURSE)
1986     {
1987 ph10 624 const uschar *scode;
1988     BOOL empty_branch;
1989    
1990     /* Test for forward reference */
1991    
1992     for (scode = cd->start_workspace; scode < cd->hwm; scode += LINK_SIZE)
1993     if (GET(scode, 0) == code + 1 - cd->start_code) return TRUE;
1994    
1995     /* Not a forward reference, test for completed backward reference */
1996    
1997     empty_branch = FALSE;
1998     scode = cd->start_code + GET(code, 1);
1999 ph10 503 if (GET(scode, 1) == 0) return TRUE; /* Unclosed */
2000 ph10 624
2001     /* Completed backwards reference */
2002    
2003 ph10 503 do
2004     {
2005 ph10 504 if (could_be_empty_branch(scode, endcode, utf8, cd))
2006     {
2007     empty_branch = TRUE;
2008 ph10 507 break;
2009     }
2010 ph10 503 scode += GET(scode, 1);
2011     }
2012     while (*scode == OP_ALT);
2013 ph10 624
2014 ph10 504 if (!empty_branch) return FALSE; /* All branches are non-empty */
2015 ph10 503 continue;
2016 ph10 507 }
2017 ph10 170
2018 ph10 604 /* Groups with zero repeats can of course be empty; skip them. */
2019    
2020     if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO ||
2021     c == OP_BRAPOSZERO)
2022     {
2023     code += _pcre_OP_lengths[c];
2024     do code += GET(code, 1); while (*code == OP_ALT);
2025     c = *code;
2026     continue;
2027     }
2028    
2029     /* A nested group that is already marked as "could be empty" can just be
2030     skipped. */
2031    
2032     if (c == OP_SBRA || c == OP_SBRAPOS ||
2033     c == OP_SCBRA || c == OP_SCBRAPOS)
2034     {
2035     do code += GET(code, 1); while (*code == OP_ALT);
2036     c = *code;
2037     continue;
2038     }
2039    
2040 ph10 170 /* For other groups, scan the branches. */
2041 ph10 172
2042 ph10 604 if (c == OP_BRA || c == OP_BRAPOS ||
2043     c == OP_CBRA || c == OP_CBRAPOS ||
2044     c == OP_ONCE || c == OP_COND)
2045 nigel 77 {
2046     BOOL empty_branch;
2047     if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
2048 ph10 406
2049     /* If a conditional group has only one branch, there is a second, implied,
2050 ph10 395 empty branch, so just skip over the conditional, because it could be empty.
2051     Otherwise, scan the individual branches of the group. */
2052 ph10 406
2053 ph10 395 if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
2054 nigel 77 code += GET(code, 1);
2055 ph10 395 else
2056 ph10 406 {
2057 ph10 395 empty_branch = FALSE;
2058     do
2059     {
2060 ph10 503 if (!empty_branch && could_be_empty_branch(code, endcode, utf8, cd))
2061 ph10 395 empty_branch = TRUE;
2062     code += GET(code, 1);
2063     }
2064     while (*code == OP_ALT);
2065     if (!empty_branch) return FALSE; /* All branches are non-empty */
2066 nigel 77 }
2067 ph10 406
2068 ph10 172 c = *code;
2069 nigel 93 continue;
2070 nigel 77 }
2071    
2072 nigel 93 /* Handle the other opcodes */
2073    
2074     switch (c)
2075 nigel 77 {
2076 ph10 216 /* Check for quantifiers after a class. XCLASS is used for classes that
2077     cannot be represented just by a bit map. This includes negated single
2078     high-valued characters. The length in _pcre_OP_lengths[] is zero; the
2079 ph10 220 actual length is stored in the compiled code, so we must update "code"
2080 ph10 216 here. */
2081 nigel 77
2082     #ifdef SUPPORT_UTF8
2083     case OP_XCLASS:
2084 ph10 216 ccode = code += GET(code, 1);
2085 nigel 77 goto CHECK_CLASS_REPEAT;
2086     #endif
2087    
2088     case OP_CLASS:
2089     case OP_NCLASS:
2090     ccode = code + 33;
2091    
2092     #ifdef SUPPORT_UTF8
2093     CHECK_CLASS_REPEAT:
2094     #endif
2095    
2096     switch (*ccode)
2097     {
2098     case OP_CRSTAR: /* These could be empty; continue */
2099     case OP_CRMINSTAR:
2100     case OP_CRQUERY:
2101     case OP_CRMINQUERY:
2102     break;
2103    
2104     default: /* Non-repeat => class must match */
2105     case OP_CRPLUS: /* These repeats aren't empty */
2106     case OP_CRMINPLUS:
2107     return FALSE;
2108    
2109     case OP_CRRANGE:
2110     case OP_CRMINRANGE:
2111     if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
2112     break;
2113     }
2114     break;
2115    
2116     /* Opcodes that must match a character */
2117    
2118     case OP_PROP:
2119     case OP_NOTPROP:
2120     case OP_EXTUNI:
2121     case OP_NOT_DIGIT:
2122     case OP_DIGIT:
2123     case OP_NOT_WHITESPACE:
2124     case OP_WHITESPACE:
2125     case OP_NOT_WORDCHAR:
2126     case OP_WORDCHAR:
2127     case OP_ANY:
2128 ph10 345 case OP_ALLANY:
2129 nigel 77 case OP_ANYBYTE:
2130     case OP_CHAR:
2131 ph10 602 case OP_CHARI:
2132 nigel 77 case OP_NOT:
2133 ph10 602 case OP_NOTI:
2134 nigel 77 case OP_PLUS:
2135     case OP_MINPLUS:
2136 nigel 93 case OP_POSPLUS:
2137 nigel 77 case OP_EXACT:
2138     case OP_NOTPLUS:
2139     case OP_NOTMINPLUS:
2140 nigel 93 case OP_NOTPOSPLUS:
2141 nigel 77 case OP_NOTEXACT:
2142     case OP_TYPEPLUS:
2143     case OP_TYPEMINPLUS:
2144 nigel 93 case OP_TYPEPOSPLUS:
2145 nigel 77 case OP_TYPEEXACT:
2146     return FALSE;
2147 ph10 227
2148     /* These are going to continue, as they may be empty, but we have to
2149     fudge the length for the \p and \P cases. */
2150    
2151 ph10 224 case OP_TYPESTAR:
2152     case OP_TYPEMINSTAR:
2153     case OP_TYPEPOSSTAR:
2154     case OP_TYPEQUERY:
2155     case OP_TYPEMINQUERY:
2156     case OP_TYPEPOSQUERY:
2157     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2158 ph10 227 break;
2159    
2160 ph10 224 /* Same for these */
2161 ph10 227
2162 ph10 224 case OP_TYPEUPTO:
2163     case OP_TYPEMINUPTO:
2164     case OP_TYPEPOSUPTO:
2165     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
2166     break;
2167 nigel 77
2168     /* End of branch */
2169    
2170     case OP_KET:
2171     case OP_KETRMAX:
2172     case OP_KETRMIN:
2173 ph10 604 case OP_KETRPOS:
2174 nigel 77 case OP_ALT:
2175     return TRUE;
2176    
2177 nigel 93 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
2178     MINUPTO, and POSUPTO may be followed by a multibyte character */
2179 nigel 77
2180     #ifdef SUPPORT_UTF8
2181     case OP_STAR:
2182 ph10 602 case OP_STARI:
2183 nigel 77 case OP_MINSTAR:
2184 ph10 602 case OP_MINSTARI:
2185 nigel 93 case OP_POSSTAR:
2186 ph10 602 case OP_POSSTARI:
2187 nigel 77 case OP_QUERY:
2188 ph10 602 case OP_QUERYI:
2189 nigel 77 case OP_MINQUERY:
2190 ph10 602 case OP_MINQUERYI:
2191 nigel 93 case OP_POSQUERY:
2192 ph10 602 case OP_POSQUERYI:
2193 ph10 426 if (utf8 && code[1] >= 0xc0) code += _pcre_utf8_table4[code[1] & 0x3f];
2194     break;
2195 ph10 461
2196 nigel 77 case OP_UPTO:
2197 ph10 602 case OP_UPTOI:
2198 nigel 77 case OP_MINUPTO:
2199 ph10 602 case OP_MINUPTOI:
2200 nigel 93 case OP_POSUPTO:
2201 ph10 602 case OP_POSUPTOI:
2202 ph10 426 if (utf8 && code[3] >= 0xc0) code += _pcre_utf8_table4[code[3] & 0x3f];
2203 nigel 77 break;
2204     #endif
2205 ph10 503
2206 ph10 510 /* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument
2207     string. */
2208    
2209     case OP_MARK:
2210     case OP_PRUNE_ARG:
2211     case OP_SKIP_ARG:
2212     code += code[1];
2213 ph10 512 break;
2214 ph10 510
2215 ph10 550 case OP_THEN_ARG:
2216     code += code[1+LINK_SIZE];
2217     break;
2218    
2219 ph10 503 /* None of the remaining opcodes are required to match a character. */
2220 ph10 507
2221 ph10 503 default:
2222 ph10 507 break;
2223 nigel 77 }
2224     }
2225    
2226     return TRUE;
2227     }
2228    
2229    
2230    
2231     /*************************************************
2232     * Scan compiled regex for non-emptiness *
2233     *************************************************/
2234    
2235     /* This function is called to check for left recursive calls. We want to check
2236     the current branch of the current pattern to see if it could match the empty
2237     string. If it could, we must look outwards for branches at other levels,
2238     stopping when we pass beyond the bracket which is the subject of the recursion.
2239 ph10 624 This function is called only during the real compile, not during the
2240     pre-compile.
2241 nigel 77
2242     Arguments:
2243     code points to start of the recursion
2244     endcode points to where to stop (current RECURSE item)
2245     bcptr points to the chain of current (unclosed) branch starts
2246     utf8 TRUE if in UTF-8 mode
2247 ph10 507 cd pointers to tables etc
2248 nigel 77
2249     Returns: TRUE if what is matched could be empty
2250     */
2251    
2252     static BOOL
2253     could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
2254 ph10 503 BOOL utf8, compile_data *cd)
2255 nigel 77 {
2256 ph10 475 while (bcptr != NULL && bcptr->current_branch >= code)
2257 nigel 77 {
2258 ph10 503 if (!could_be_empty_branch(bcptr->current_branch, endcode, utf8, cd))
2259 ph10 475 return FALSE;
2260 nigel 77 bcptr = bcptr->outer;
2261     }
2262     return TRUE;
2263     }
2264    
2265    
2266    
2267     /*************************************************
2268     * Check for POSIX class syntax *
2269     *************************************************/
2270    
2271     /* This function is called when the sequence "[:" or "[." or "[=" is
2272 ph10 295 encountered in a character class. It checks whether this is followed by a
2273 ph10 298 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
2274 ph10 295 reach an unescaped ']' without the special preceding character, return FALSE.
2275 nigel 77
2276 ph10 298 Originally, this function only recognized a sequence of letters between the
2277     terminators, but it seems that Perl recognizes any sequence of characters,
2278     though of course unknown POSIX names are subsequently rejected. Perl gives an
2279     "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
2280     didn't consider this to be a POSIX class. Likewise for [:1234:].
2281 ph10 295
2282 ph10 298 The problem in trying to be exactly like Perl is in the handling of escapes. We
2283     have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
2284     class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
2285     below handles the special case of \], but does not try to do any other escape
2286     processing. This makes it different from Perl for cases such as [:l\ower:]
2287 ph10 295 where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
2288 ph10 298 "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
2289 ph10 295 I think.
2290    
2291     Arguments:
2292 nigel 77 ptr pointer to the initial [
2293     endptr where to return the end pointer
2294    
2295     Returns: TRUE or FALSE
2296     */
2297    
2298     static BOOL
2299 ph10 295 check_posix_syntax(const uschar *ptr, const uschar **endptr)
2300 nigel 77 {
2301     int terminator; /* Don't combine these lines; the Solaris cc */
2302     terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
2303 ph10 295 for (++ptr; *ptr != 0; ptr++)
2304 nigel 77 {
2305 ph10 391 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) ptr++; else
2306 ph10 298 {
2307 ph10 391 if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2308     if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2309 ph10 295 {
2310     *endptr = ptr;
2311     return TRUE;
2312 ph10 298 }
2313     }
2314     }
2315 nigel 77 return FALSE;
2316     }
2317    
2318    
2319    
2320    
2321     /*************************************************
2322     * Check POSIX class name *
2323     *************************************************/
2324    
2325     /* This function is called to check the name given in a POSIX-style class entry
2326     such as [:alnum:].
2327    
2328     Arguments:
2329     ptr points to the first letter
2330     len the length of the name
2331    
2332     Returns: a value representing the name, or -1 if unknown
2333     */
2334    
2335     static int
2336     check_posix_name(const uschar *ptr, int len)
2337     {
2338 ph10 240 const char *pn = posix_names;
2339 nigel 77 register int yield = 0;
2340     while (posix_name_lengths[yield] != 0)
2341     {
2342     if (len == posix_name_lengths[yield] &&
2343 ph10 240 strncmp((const char *)ptr, pn, len) == 0) return yield;
2344 ph10 243 pn += posix_name_lengths[yield] + 1;
2345 nigel 77 yield++;
2346     }
2347     return -1;
2348     }
2349    
2350    
2351     /*************************************************
2352     * Adjust OP_RECURSE items in repeated group *
2353     *************************************************/
2354    
2355     /* OP_RECURSE items contain an offset from the start of the regex to the group
2356     that is referenced. This means that groups can be replicated for fixed
2357     repetition simply by copying (because the recursion is allowed to refer to
2358     earlier groups that are outside the current group). However, when a group is
2359 ph10 335 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
2360     inserted before it, after it has been compiled. This means that any OP_RECURSE
2361     items within it that refer to the group itself or any contained groups have to
2362     have their offsets adjusted. That is one of the jobs of this function. Before it
2363     is called, the partially compiled regex must be temporarily terminated with
2364     OP_END.
2365 nigel 77
2366 nigel 93 This function has been extended with the possibility of forward references for
2367     recursions and subroutine calls. It must also check the list of such references
2368     for the group we are dealing with. If it finds that one of the recursions in
2369     the current group is on this list, it adjusts the offset in the list, not the
2370     value in the reference (which is a group number).
2371    
2372 nigel 77 Arguments:
2373     group points to the start of the group
2374     adjust the amount by which the group is to be moved
2375     utf8 TRUE in UTF-8 mode
2376     cd contains pointers to tables etc.
2377 nigel 93 save_hwm the hwm forward reference pointer at the start of the group
2378 nigel 77
2379     Returns: nothing
2380     */
2381    
2382     static void
2383 nigel 93 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
2384     uschar *save_hwm)
2385 nigel 77 {
2386     uschar *ptr = group;
2387 ph10 224
2388 nigel 77 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
2389     {
2390 nigel 93 int offset;
2391     uschar *hc;
2392    
2393     /* See if this recursion is on the forward reference list. If so, adjust the
2394     reference. */
2395 ph10 345
2396 nigel 93 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
2397     {
2398     offset = GET(hc, 0);
2399     if (cd->start_code + offset == ptr + 1)
2400     {
2401     PUT(hc, 0, offset + adjust);
2402     break;
2403     }
2404     }
2405    
2406     /* Otherwise, adjust the recursion offset if it's after the start of this
2407     group. */
2408    
2409     if (hc >= cd->hwm)
2410     {
2411     offset = GET(ptr, 1);
2412     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
2413     }
2414    
2415 nigel 77 ptr += 1 + LINK_SIZE;
2416     }
2417     }
2418    
2419    
2420    
2421     /*************************************************
2422     * Insert an automatic callout point *
2423     *************************************************/
2424    
2425     /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
2426     callout points before each pattern item.
2427    
2428     Arguments:
2429     code current code pointer
2430     ptr current pattern pointer
2431     cd pointers to tables etc
2432    
2433     Returns: new code pointer
2434     */
2435    
2436     static uschar *
2437     auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
2438     {
2439     *code++ = OP_CALLOUT;
2440     *code++ = 255;
2441 ph10 530 PUT(code, 0, (int)(ptr - cd->start_pattern)); /* Pattern offset */
2442     PUT(code, LINK_SIZE, 0); /* Default length */
2443 nigel 77 return code + 2*LINK_SIZE;
2444     }
2445    
2446    
2447    
2448     /*************************************************
2449     * Complete a callout item *
2450     *************************************************/
2451    
2452     /* A callout item contains the length of the next item in the pattern, which
2453     we can't fill in till after we have reached the relevant point. This is used
2454     for both automatic and manual callouts.
2455    
2456     Arguments:
2457     previous_callout points to previous callout item
2458     ptr current pattern pointer
2459     cd pointers to tables etc
2460    
2461     Returns: nothing
2462     */
2463    
2464     static void
2465     complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
2466     {
2467 ph10 530 int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));
2468 nigel 77 PUT(previous_callout, 2 + LINK_SIZE, length);
2469     }
2470    
2471    
2472    
2473     #ifdef SUPPORT_UCP
2474     /*************************************************
2475     * Get othercase range *
2476     *************************************************/
2477    
2478     /* This function is passed the start and end of a class range, in UTF-8 mode
2479     with UCP support. It searches up the characters, looking for internal ranges of
2480     characters in the "other" case. Each call returns the next one, updating the
2481     start address.
2482    
2483     Arguments:
2484     cptr points to starting character value; updated
2485     d end value
2486     ocptr where to put start of othercase range
2487     odptr where to put end of othercase range
2488    
2489     Yield: TRUE when range returned; FALSE when no more
2490     */
2491    
2492     static BOOL
2493 nigel 93 get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
2494     unsigned int *odptr)
2495 nigel 77 {
2496 nigel 93 unsigned int c, othercase, next;
2497 nigel 77
2498     for (c = *cptr; c <= d; c++)
2499 ph10 349 { if ((othercase = UCD_OTHERCASE(c)) != c) break; }
2500 nigel 77
2501     if (c > d) return FALSE;
2502    
2503     *ocptr = othercase;
2504     next = othercase + 1;
2505    
2506     for (++c; c <= d; c++)
2507     {
2508 ph10 349 if (UCD_OTHERCASE(c) != next) break;
2509 nigel 77 next++;
2510     }
2511    
2512     *odptr = next - 1;
2513     *cptr = c;
2514    
2515     return TRUE;
2516     }
2517 ph10 532
2518    
2519    
2520     /*************************************************
2521     * Check a character and a property *
2522     *************************************************/
2523    
2524     /* This function is called by check_auto_possessive() when a property item
2525     is adjacent to a fixed character.
2526    
2527     Arguments:
2528     c the character
2529     ptype the property type
2530     pdata the data for the type
2531     negated TRUE if it's a negated property (\P or \p{^)
2532 ph10 535
2533 ph10 532 Returns: TRUE if auto-possessifying is OK
2534 ph10 535 */
2535 ph10 532
2536     static BOOL
2537     check_char_prop(int c, int ptype, int pdata, BOOL negated)
2538     {
2539     const ucd_record *prop = GET_UCD(c);
2540     switch(ptype)
2541     {
2542     case PT_LAMP:
2543     return (prop->chartype == ucp_Lu ||
2544     prop->chartype == ucp_Ll ||
2545     prop->chartype == ucp_Lt) == negated;
2546    
2547     case PT_GC:
2548     return (pdata == _pcre_ucp_gentype[prop->chartype]) == negated;
2549    
2550     case PT_PC:
2551     return (pdata == prop->chartype) == negated;
2552    
2553     case PT_SC:
2554     return (pdata == prop->script) == negated;
2555    
2556     /* These are specials */
2557    
2558     case PT_ALNUM:
2559     return (_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2560     _pcre_ucp_gentype[prop->chartype] == ucp_N) == negated;
2561    
2562     case PT_SPACE: /* Perl space */
2563     return (_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2564     c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2565     == negated;
2566    
2567     case PT_PXSPACE: /* POSIX space */
2568     return (_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2569     c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2570     c == CHAR_FF || c == CHAR_CR)
2571     == negated;
2572    
2573     case PT_WORD:
2574     return (_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2575     _pcre_ucp_gentype[prop->chartype] == ucp_N ||
2576     c == CHAR_UNDERSCORE) == negated;
2577     }
2578 ph10 535 return FALSE;
2579 ph10 532 }
2580 nigel 77 #endif /* SUPPORT_UCP */
2581    
2582    
2583 nigel 93
2584 nigel 77 /*************************************************
2585 nigel 93 * Check if auto-possessifying is possible *
2586     *************************************************/
2587    
2588     /* This function is called for unlimited repeats of certain items, to see
2589     whether the next thing could possibly match the repeated item. If not, it makes
2590     sense to automatically possessify the repeated item.
2591    
2592     Arguments:
2593 ph10 532 previous pointer to the repeated opcode
2594 nigel 93 utf8 TRUE in UTF-8 mode
2595     ptr next character in pattern
2596     options options bits
2597     cd contains pointers to tables etc.
2598    
2599     Returns: TRUE if possessifying is wanted
2600     */
2601    
2602     static BOOL
2603 ph10 535 check_auto_possessive(const uschar *previous, BOOL utf8, const uschar *ptr,
2604 ph10 532 int options, compile_data *cd)
2605 nigel 93 {
2606 ph10 532 int c, next;
2607     int op_code = *previous++;
2608 nigel 93
2609     /* Skip whitespace and comments in extended mode */
2610    
2611     if ((options & PCRE_EXTENDED) != 0)
2612     {
2613     for (;;)
2614     {
2615     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2616 ph10 391 if (*ptr == CHAR_NUMBER_SIGN)
2617 nigel 93 {
2618 ph10 579 ptr++;
2619 ph10 556 while (*ptr != 0)
2620     {
2621 nigel 93 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2622 ph10 556 ptr++;
2623 ph10 579 #ifdef SUPPORT_UTF8
2624 ph10 556 if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
2625     #endif
2626     }
2627 nigel 93 }
2628     else break;
2629     }
2630     }
2631    
2632     /* If the next item is one that we can handle, get its value. A non-negative
2633     value is a character, a negative value is an escape value. */
2634    
2635 ph10 391 if (*ptr == CHAR_BACKSLASH)
2636 nigel 93 {
2637     int temperrorcode = 0;
2638     next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2639     if (temperrorcode != 0) return FALSE;
2640     ptr++; /* Point after the escape sequence */
2641     }
2642    
2643     else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2644     {
2645     #ifdef SUPPORT_UTF8
2646     if (utf8) { GETCHARINC(next, ptr); } else
2647     #endif
2648     next = *ptr++;
2649     }
2650    
2651     else return FALSE;
2652    
2653     /* Skip whitespace and comments in extended mode */
2654    
2655     if ((options & PCRE_EXTENDED) != 0)
2656     {
2657     for (;;)
2658     {
2659     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2660 ph10 391 if (*ptr == CHAR_NUMBER_SIGN)
2661 nigel 93 {
2662 ph10 579 ptr++;
2663 ph10 556 while (*ptr != 0)
2664     {
2665 nigel 93 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2666 ph10 556 ptr++;
2667 ph10 579 #ifdef SUPPORT_UTF8
2668 ph10 556 if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
2669     #endif
2670     }
2671 nigel 93 }
2672     else break;
2673     }
2674     }
2675    
2676     /* If the next thing is itself optional, we have to give up. */
2677    
2678 ph10 392 if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
2679 ph10 391 strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
2680     return FALSE;
2681 nigel 93
2682 ph10 532 /* Now compare the next item with the previous opcode. First, handle cases when
2683     the next item is a character. */
2684 nigel 93
2685     if (next >= 0) switch(op_code)
2686     {
2687     case OP_CHAR:
2688 ph10 535 #ifdef SUPPORT_UTF8
2689 ph10 532 GETCHARTEST(c, previous);
2690 ph10 369 #else
2691 ph10 532 c = *previous;
2692 ph10 535 #endif
2693     return c != next;
2694 nigel 93
2695 ph10 602 /* For CHARI (caseless character) we must check the other case. If we have
2696 nigel 93 Unicode property support, we can use it to test the other case of
2697     high-valued characters. */
2698    
2699 ph10 602 case OP_CHARI:
2700 ph10 535 #ifdef SUPPORT_UTF8
2701 ph10 532 GETCHARTEST(c, previous);
2702     #else
2703     c = *previous;
2704 ph10 535 #endif
2705 ph10 532 if (c == next) return FALSE;
2706 nigel 93 #ifdef SUPPORT_UTF8
2707     if (utf8)
2708     {
2709     unsigned int othercase;
2710     if (next < 128) othercase = cd->fcc[next]; else
2711     #ifdef SUPPORT_UCP
2712 ph10 349 othercase = UCD_OTHERCASE((unsigned int)next);
2713 nigel 93 #else
2714     othercase = NOTACHAR;
2715     #endif
2716 ph10 532 return (unsigned int)c != othercase;
2717 nigel 93 }
2718     else
2719     #endif /* SUPPORT_UTF8 */
2720 ph10 532 return (c != cd->fcc[next]); /* Non-UTF-8 mode */
2721 nigel 93
2722 ph10 602 /* For OP_NOT and OP_NOTI, the data is always a single-byte character. These
2723 ph10 604 opcodes are not used for multi-byte characters, because they are coded using
2724 ph10 602 an XCLASS instead. */
2725 nigel 93
2726     case OP_NOT:
2727 ph10 602 return (c = *previous) == next;
2728 ph10 604
2729     case OP_NOTI:
2730 ph10 532 if ((c = *previous) == next) return TRUE;
2731 nigel 93 #ifdef SUPPORT_UTF8
2732     if (utf8)
2733     {
2734     unsigned int othercase;
2735     if (next < 128) othercase = cd->fcc[next]; else
2736     #ifdef SUPPORT_UCP
2737 ph10 349 othercase = UCD_OTHERCASE(next);
2738 nigel 93 #else
2739     othercase = NOTACHAR;
2740     #endif
2741 ph10 532 return (unsigned int)c == othercase;
2742 nigel 93 }
2743     else
2744     #endif /* SUPPORT_UTF8 */
2745 ph10 532 return (c == cd->fcc[next]); /* Non-UTF-8 mode */
2746 nigel 93
2747 ph10 535 /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.
2748     When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
2749    
2750 nigel 93 case OP_DIGIT:
2751     return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2752    
2753     case OP_NOT_DIGIT:
2754     return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2755    
2756     case OP_WHITESPACE:
2757     return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2758    
2759     case OP_NOT_WHITESPACE:
2760     return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2761    
2762     case OP_WORDCHAR:
2763     return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2764    
2765     case OP_NOT_WORDCHAR:
2766     return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2767    
2768 ph10 180 case OP_HSPACE:
2769     case OP_NOT_HSPACE:
2770     switch(next)
2771     {
2772     case 0x09:
2773     case 0x20:
2774     case 0xa0:
2775     case 0x1680:
2776     case 0x180e:
2777     case 0x2000:
2778     case 0x2001:
2779     case 0x2002:
2780     case 0x2003:
2781     case 0x2004:
2782     case 0x2005:
2783     case 0x2006:
2784     case 0x2007:
2785     case 0x2008:
2786     case 0x2009:
2787     case 0x200A:
2788     case 0x202f:
2789     case 0x205f:
2790     case 0x3000:
2791 ph10 528 return op_code == OP_NOT_HSPACE;
2792 ph10 180 default:
2793 ph10 528 return op_code != OP_NOT_HSPACE;
2794 ph10 180 }
2795    
2796 ph10 528 case OP_ANYNL:
2797 ph10 180 case OP_VSPACE:
2798     case OP_NOT_VSPACE:
2799     switch(next)
2800     {
2801     case 0x0a:
2802     case 0x0b:
2803     case 0x0c:
2804     case 0x0d:
2805     case 0x85:
2806     case 0x2028:
2807     case 0x2029:
2808 ph10 528 return op_code == OP_NOT_VSPACE;
2809 ph10 180 default:
2810 ph10 528 return op_code != OP_NOT_VSPACE;
2811 ph10 180 }
2812    
2813 ph10 532 #ifdef SUPPORT_UCP
2814     case OP_PROP:
2815     return check_char_prop(next, previous[0], previous[1], FALSE);
2816 ph10 535
2817 ph10 532 case OP_NOTPROP:
2818     return check_char_prop(next, previous[0], previous[1], TRUE);
2819     #endif
2820    
2821 nigel 93 default:
2822     return FALSE;
2823     }
2824    
2825    
2826 ph10 535 /* Handle the case when the next item is \d, \s, etc. Note that when PCRE_UCP
2827     is set, \d turns into ESC_du rather than ESC_d, etc., so ESC_d etc. are
2828     generated only when PCRE_UCP is *not* set, that is, when only ASCII
2829     characteristics are recognized. Similarly, the opcodes OP_DIGIT etc. are
2830 ph10 532 replaced by OP_PROP codes when PCRE_UCP is set. */
2831 nigel 93
2832     switch(op_code)
2833     {
2834     case OP_CHAR:
2835 ph10 602 case OP_CHARI:
2836 ph10 535 #ifdef SUPPORT_UTF8
2837 ph10 532 GETCHARTEST(c, previous);
2838     #else
2839     c = *previous;
2840 ph10 535 #endif
2841 nigel 93 switch(-next)
2842     {
2843     case ESC_d:
2844 ph10 532 return c > 127 || (cd->ctypes[c] & ctype_digit) == 0;
2845 nigel 93
2846     case ESC_D:
2847 ph10 532 return c <= 127 && (cd->ctypes[c] & ctype_digit) != 0;
2848 nigel 93
2849     case ESC_s:
2850 ph10 532 return c > 127 || (cd->ctypes[c] & ctype_space) == 0;
2851 nigel 93
2852     case ESC_S:
2853 ph10 532 return c <= 127 && (cd->ctypes[c] & ctype_space) != 0;
2854 nigel 93
2855     case ESC_w:
2856 ph10 532 return c > 127 || (cd->ctypes[c] & ctype_word) == 0;
2857 nigel 93
2858     case ESC_W:
2859 ph10 532 return c <= 127 && (cd->ctypes[c] & ctype_word) != 0;
2860 ph10 182
2861 ph10 180 case ESC_h:
2862     case ESC_H:
2863 ph10 532 switch(c)
2864 ph10 180 {
2865     case 0x09:
2866     case 0x20:
2867     case 0xa0:
2868     case 0x1680:
2869     case 0x180e:
2870     case 0x2000:
2871     case 0x2001:
2872     case 0x2002:
2873     case 0x2003:
2874     case 0x2004:
2875     case 0x2005:
2876     case 0x2006:
2877     case 0x2007:
2878     case 0x2008:
2879     case 0x2009:
2880     case 0x200A:
2881     case 0x202f:
2882     case 0x205f:
2883     case 0x3000:
2884     return -next != ESC_h;
2885     default:
2886     return -next == ESC_h;
2887 ph10 182 }
2888    
2889 ph10 180 case ESC_v:
2890     case ESC_V:
2891 ph10 532 switch(c)
2892 ph10 180 {
2893     case 0x0a:
2894     case 0x0b:
2895     case 0x0c:
2896     case 0x0d:
2897     case 0x85:
2898     case 0x2028:
2899     case 0x2029:
2900     return -next != ESC_v;
2901     default:
2902     return -next == ESC_v;
2903 ph10 182 }
2904 ph10 535
2905     /* When PCRE_UCP is set, these values get generated for \d etc. Find
2906     their substitutions and process them. The result will always be either
2907 ph10 532 -ESC_p or -ESC_P. Then fall through to process those values. */
2908 ph10 535
2909 ph10 532 #ifdef SUPPORT_UCP
2910     case ESC_du:
2911     case ESC_DU:
2912     case ESC_wu:
2913     case ESC_WU:
2914     case ESC_su:
2915     case ESC_SU:
2916     {
2917     int temperrorcode = 0;
2918     ptr = substitutes[-next - ESC_DU];
2919     next = check_escape(&ptr, &temperrorcode, 0, options, FALSE);
2920     if (temperrorcode != 0) return FALSE;
2921     ptr++; /* For compatibility */
2922     }
2923 ph10 535 /* Fall through */
2924 nigel 93
2925 ph10 532 case ESC_p:
2926     case ESC_P:
2927     {
2928     int ptype, pdata, errorcodeptr;
2929 ph10 535 BOOL negated;
2930    
2931 ph10 532 ptr--; /* Make ptr point at the p or P */
2932     ptype = get_ucp(&ptr, &negated, &pdata, &errorcodeptr);
2933     if (ptype < 0) return FALSE;
2934     ptr++; /* Point past the final curly ket */
2935 ph10 535
2936 ph10 532 /* If the property item is optional, we have to give up. (When generated
2937     from \d etc by PCRE_UCP, this test will have been applied much earlier,
2938     to the original \d etc. At this point, ptr will point to a zero byte. */
2939 ph10 535
2940 ph10 532 if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
2941     strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
2942     return FALSE;
2943 ph10 535
2944 ph10 532 /* Do the property check. */
2945 ph10 535
2946 ph10 532 return check_char_prop(c, ptype, pdata, (next == -ESC_P) != negated);
2947 ph10 535 }
2948 ph10 532 #endif
2949    
2950 nigel 93 default:
2951     return FALSE;
2952     }
2953    
2954 ph10 535 /* In principle, support for Unicode properties should be integrated here as
2955     well. It means re-organizing the above code so as to get hold of the property
2956     values before switching on the op-code. However, I wonder how many patterns
2957     combine ASCII \d etc with Unicode properties? (Note that if PCRE_UCP is set,
2958     these op-codes are never generated.) */
2959    
2960 nigel 93 case OP_DIGIT:
2961 ph10 180 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2962 ph10 528 next == -ESC_h || next == -ESC_v || next == -ESC_R;
2963 nigel 93
2964     case OP_NOT_DIGIT:
2965     return next == -ESC_d;
2966    
2967     case OP_WHITESPACE:
2968 ph10 528 return next == -ESC_S || next == -ESC_d || next == -ESC_w || next == -ESC_R;
2969 nigel 93
2970     case OP_NOT_WHITESPACE:
2971 ph10 180 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2972 nigel 93
2973 ph10 180 case OP_HSPACE:
2974 ph10 535 return next == -ESC_S || next == -ESC_H || next == -ESC_d ||
2975 ph10 528 next == -ESC_w || next == -ESC_v || next == -ESC_R;
2976 ph10 180
2977     case OP_NOT_HSPACE:
2978     return next == -ESC_h;
2979 ph10 182
2980 ph10 180 /* Can't have \S in here because VT matches \S (Perl anomaly) */
2981 ph10 535 case OP_ANYNL:
2982 ph10 182 case OP_VSPACE:
2983 ph10 180 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2984    
2985     case OP_NOT_VSPACE:
2986 ph10 528 return next == -ESC_v || next == -ESC_R;
2987 ph10 180
2988 nigel 93 case OP_WORDCHAR:
2989 ph10 535 return next == -ESC_W || next == -ESC_s || next == -ESC_h ||
2990 ph10 528 next == -ESC_v || next == -ESC_R;
2991 nigel 93
2992     case OP_NOT_WORDCHAR:
2993     return next == -ESC_w || next == -ESC_d;
2994 ph10 182
2995 nigel 93 default:
2996     return FALSE;
2997     }
2998    
2999     /* Control does not reach here */
3000     }
3001    
3002    
3003    
3004     /*************************************************
3005 nigel 77 * Compile one branch *
3006     *************************************************/
3007    
3008 nigel 93 /* Scan the pattern, compiling it into a vector. If the options are
3009 nigel 77 changed during the branch, the pointer is used to change the external options
3010 nigel 93 bits. This function is used during the pre-compile phase when we are trying
3011     to find out the amount of memory needed, as well as during the real compile
3012     phase. The value of lengthptr distinguishes the two phases.
3013 nigel 77
3014     Arguments:
3015     optionsptr pointer to the option bits
3016     codeptr points to the pointer to the current code point
3017     ptrptr points to the current pattern pointer
3018     errorcodeptr points to error code variable
3019     firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
3020     reqbyteptr set to the last literal character required, else < 0
3021     bcptr points to current branch chain
3022     cd contains pointers to tables etc.
3023 nigel 93 lengthptr NULL during the real compile phase
3024     points to length accumulator during pre-compile phase
3025 nigel 77
3026     Returns: TRUE on success
3027     FALSE, with *errorcodeptr set non-zero on error
3028     */
3029    
3030     static BOOL
3031 nigel 93 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
3032     int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
3033     compile_data *cd, int *lengthptr)
3034 nigel 77 {
3035     int repeat_type, op_type;
3036     int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
3037     int bravalue = 0;
3038     int greedy_default, greedy_non_default;
3039     int firstbyte, reqbyte;
3040     int zeroreqbyte, zerofirstbyte;
3041     int req_caseopt, reqvary, tempreqvary;
3042     int options = *optionsptr;
3043     int after_manual_callout = 0;
3044 nigel 93 int length_prevgroup = 0;
3045 nigel 77 register int c;
3046     register uschar *code = *codeptr;
3047 nigel 93 uschar *last_code = code;
3048     uschar *orig_code = code;
3049 nigel 77 uschar *tempcode;
3050     BOOL inescq = FALSE;
3051     BOOL groupsetfirstbyte = FALSE;
3052     const uschar *ptr = *ptrptr;
3053     const uschar *tempptr;
3054 ph10 518 const uschar *nestptr = NULL;
3055 nigel 77 uschar *previous = NULL;
3056     uschar *previous_callout = NULL;
3057 nigel 93 uschar *save_hwm = NULL;
3058 nigel 77 uschar classbits[32];
3059    
3060     #ifdef SUPPORT_UTF8
3061     BOOL class_utf8;
3062     BOOL utf8 = (options & PCRE_UTF8) != 0;
3063     uschar *class_utf8data;
3064 ph10 300 uschar *class_utf8data_base;
3065 nigel 77 uschar utf8_char[6];
3066     #else
3067     BOOL utf8 = FALSE;
3068 nigel 93 uschar *utf8_char = NULL;
3069 nigel 77 #endif
3070    
3071 ph10 475 #ifdef PCRE_DEBUG
3072 nigel 93 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
3073     #endif
3074    
3075 nigel 77 /* Set up the default and non-default settings for greediness */
3076    
3077     greedy_default = ((options & PCRE_UNGREEDY) != 0);
3078     greedy_non_default = greedy_default ^ 1;
3079    
3080     /* Initialize no first byte, no required byte. REQ_UNSET means "no char
3081     matching encountered yet". It gets changed to REQ_NONE if we hit something that
3082     matches a non-fixed char first char; reqbyte just remains unset if we never
3083     find one.
3084    
3085     When we hit a repeat whose minimum is zero, we may have to adjust these values
3086     to take the zero repeat into account. This is implemented by setting them to
3087     zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
3088     item types that can be repeated set these backoff variables appropriately. */
3089    
3090     firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
3091    
3092     /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
3093     according to the current setting of the caseless flag. REQ_CASELESS is a bit
3094     value > 255. It is added into the firstbyte or reqbyte variables to record the
3095     case status of the value. This is used only for ASCII characters. */
3096    
3097     req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
3098    
3099     /* Switch on next character until the end of the branch */
3100    
3101     for (;; ptr++)
3102     {
3103     BOOL negate_class;
3104 ph10 286 BOOL should_flip_negation;
3105 nigel 77 BOOL possessive_quantifier;
3106     BOOL is_quantifier;
3107 nigel 93 BOOL is_recurse;
3108 ph10 180 BOOL reset_bracount;
3109 nigel 77 int class_charcount;
3110     int class_lastchar;
3111     int newoptions;
3112     int recno;
3113 ph10 172 int refsign;
3114 nigel 77 int skipbytes;
3115     int subreqbyte;
3116     int subfirstbyte;
3117 nigel 93 int terminator;
3118 nigel 77 int mclength;
3119     uschar mcbuffer[8];
3120    
3121 nigel 93 /* Get next byte in the pattern */
3122 nigel 77
3123     c = *ptr;
3124 ph10 345
3125 ph10 535 /* If we are at the end of a nested substitution, revert to the outer level
3126 ph10 518 string. Nesting only happens one level deep. */
3127    
3128     if (c == 0 && nestptr != NULL)
3129     {
3130     ptr = nestptr;
3131     nestptr = NULL;
3132     c = *ptr;
3133     }
3134    
3135 nigel 93 /* If we are in the pre-compile phase, accumulate the length used for the
3136     previous cycle of this loop. */
3137    
3138     if (lengthptr != NULL)
3139     {
3140 ph10 475 #ifdef PCRE_DEBUG
3141 nigel 93 if (code > cd->hwm) cd->hwm = code; /* High water info */
3142     #endif
3143 ph10 505 if (code > cd->start_workspace + WORK_SIZE_CHECK) /* Check for overrun */
3144 nigel 93 {
3145     *errorcodeptr = ERR52;
3146     goto FAILED;
3147     }
3148    
3149     /* There is at least one situation where code goes backwards: this is the
3150     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
3151     the class is simply eliminated. However, it is created first, so we have to
3152     allow memory for it. Therefore, don't ever reduce the length at this point.
3153     */
3154    
3155     if (code < last_code) code = last_code;
3156 ph10 202
3157     /* Paranoid check for integer overflow */
3158    
3159     if (OFLOW_MAX - *lengthptr < code - last_code)
3160     {
3161     *errorcodeptr = ERR20;
3162     goto FAILED;
3163     }
3164    
3165 ph10 530 *lengthptr += (int)(code - last_code);
3166 nigel 93 DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
3167    
3168     /* If "previous" is set and it is not at the start of the work space, move
3169     it back to there, in order to avoid filling up the work space. Otherwise,
3170     if "previous" is NULL, reset the current code pointer to the start. */
3171    
3172     if (previous != NULL)
3173     {
3174     if (previous > orig_code)
3175     {
3176     memmove(orig_code, previous, code - previous);
3177     code -= previous - orig_code;
3178     previous = orig_code;
3179     }
3180     }
3181     else code = orig_code;
3182    
3183     /* Remember where this code item starts so we can pick up the length
3184     next time round. */
3185    
3186     last_code = code;
3187     }
3188    
3189     /* In the real compile phase, just check the workspace used by the forward
3190     reference list. */
3191    
3192 ph10 505 else if (cd->hwm > cd->start_workspace + WORK_SIZE_CHECK)
3193 nigel 93 {
3194     *errorcodeptr = ERR52;
3195     goto FAILED;
3196     }
3197    
3198 nigel 77 /* If in \Q...\E, check for the end; if not, we have a literal */
3199    
3200     if (inescq && c != 0)
3201     {
3202 ph10 391 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
3203 nigel 77 {
3204     inescq = FALSE;
3205     ptr++;
3206     continue;
3207     }
3208     else
3209     {
3210     if (previous_callout != NULL)
3211     {
3212 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
3213     complete_callout(previous_callout, ptr, cd);
3214 nigel 77 previous_callout = NULL;
3215     }
3216     if ((options & PCRE_AUTO_CALLOUT) != 0)
3217     {
3218     previous_callout = code;
3219     code = auto_callout(code, ptr, cd);
3220     }
3221     goto NORMAL_CHAR;
3222     }
3223     }
3224    
3225     /* Fill in length of a previous callout, except when the next thing is
3226     a quantifier. */
3227    
3228 ph10 392 is_quantifier =
3229 ph10 391 c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
3230     (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
3231 nigel 77
3232     if (!is_quantifier && previous_callout != NULL &&
3233     after_manual_callout-- <= 0)
3234     {
3235 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
3236     complete_callout(previous_callout, ptr, cd);
3237 nigel 77 previous_callout = NULL;
3238     }
3239    
3240     /* In extended mode, skip white space and comments */
3241    
3242     if ((options & PCRE_EXTENDED) != 0)
3243     {
3244     if ((cd->ctypes[c] & ctype_space) != 0) continue;
3245 ph10 391 if (c == CHAR_NUMBER_SIGN)
3246 nigel 77 {
3247 ph10 579 ptr++;
3248 ph10 556 while (*ptr != 0)
3249 nigel 91 {
3250 nigel 93 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
3251 ph10 556 ptr++;
3252 ph10 579 #ifdef SUPPORT_UTF8
3253 ph10 556 if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
3254     #endif
3255 nigel 91 }
3256 nigel 93 if (*ptr != 0) continue;
3257    
3258 nigel 91 /* Else fall through to handle end of string */
3259     c = 0;
3260 nigel 77 }
3261     }
3262    
3263     /* No auto callout for quantifiers. */
3264    
3265     if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
3266     {
3267     previous_callout = code;
3268     code = auto_callout(code, ptr, cd);
3269     }
3270    
3271     switch(c)
3272     {
3273 nigel 93 /* ===================================================================*/
3274     case 0: /* The branch terminates at string end */
3275 ph10 391 case CHAR_VERTICAL_LINE: /* or | or ) */
3276     case CHAR_RIGHT_PARENTHESIS:
3277 nigel 77 *firstbyteptr = firstbyte;
3278     *reqbyteptr = reqbyte;
3279     *codeptr = code;
3280     *ptrptr = ptr;
3281 nigel 93 if (lengthptr != NULL)
3282     {
3283 ph10 202 if (OFLOW_MAX - *lengthptr < code - last_code)
3284     {
3285     *errorcodeptr = ERR20;
3286     goto FAILED;
3287     }
3288 ph10 530 *lengthptr += (int)(code - last_code); /* To include callout length */
3289 nigel 93 DPRINTF((">> end branch\n"));
3290     }
3291 nigel 77 return TRUE;
3292    
3293 nigel 93
3294     /* ===================================================================*/
3295 nigel 77 /* Handle single-character metacharacters. In multiline mode, ^ disables
3296     the setting of any following char as a first character. */
3297    
3298 ph10 391 case CHAR_CIRCUMFLEX_ACCENT:
3299 ph10 602 previous = NULL;
3300 nigel 77 if ((options & PCRE_MULTILINE) != 0)
3301     {
3302     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3303 ph10 602 *code++ = OP_CIRCM;
3304 nigel 77 }
3305 ph10 602 else *code++ = OP_CIRC;
3306 nigel 77 break;
3307    
3308 ph10 391 case CHAR_DOLLAR_SIGN:
3309 nigel 77 previous = NULL;
3310 ph10 602 *code++ = ((options & PCRE_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
3311 nigel 77 break;
3312    
3313     /* There can never be a first char if '.' is first, whatever happens about
3314     repeats. The value of reqbyte doesn't change either. */
3315    
3316 ph10 391 case CHAR_DOT:
3317 nigel 77 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3318     zerofirstbyte = firstbyte;
3319     zeroreqbyte = reqbyte;
3320     previous = code;
3321 ph10 342 *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
3322 nigel 77 break;
3323    
3324 nigel 93
3325     /* ===================================================================*/
3326 nigel 87 /* Character classes. If the included characters are all < 256, we build a
3327     32-byte bitmap of the permitted characters, except in the special case
3328     where there is only one such character. For negated classes, we build the
3329     map as usual, then invert it at the end. However, we use a different opcode
3330     so that data characters > 255 can be handled correctly.
3331 nigel 77
3332     If the class contains characters outside the 0-255 range, a different
3333     opcode is compiled. It may optionally have a bit map for characters < 256,
3334     but those above are are explicitly listed afterwards. A flag byte tells
3335     whether the bitmap is present, and whether this is a negated class or not.
3336 ph10 345
3337 ph10 336 In JavaScript compatibility mode, an isolated ']' causes an error. In
3338     default (Perl) mode, it is treated as a data character. */
3339 ph10 345
3340 ph10 391 case CHAR_RIGHT_SQUARE_BRACKET:
3341 ph10 336 if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
3342     {
3343     *errorcodeptr = ERR64;
3344 ph10 345 goto FAILED;
3345 ph10 336 }
3346 ph10 345 goto NORMAL_CHAR;
3347 nigel 77
3348 ph10 391 case CHAR_LEFT_SQUARE_BRACKET:
3349 nigel 77 previous = code;
3350    
3351     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
3352     they are encountered at the top level, so we'll do that too. */
3353    
3354 ph10 392 if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3355 ph10 391 ptr[1] == CHAR_EQUALS_SIGN) &&
3356 ph10 295 check_posix_syntax(ptr, &tempptr))
3357 nigel 77 {
3358 ph10 391 *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
3359 nigel 77 goto FAILED;
3360     }
3361    
3362 ph10 205 /* If the first character is '^', set the negation flag and skip it. Also,
3363 ph10 208 if the first few characters (either before or after ^) are \Q\E or \E we
3364 ph10 205 skip them too. This makes for compatibility with Perl. */
3365 ph10 208
3366 ph10 205 negate_class = FALSE;
3367     for (;;)
3368 nigel 77 {
3369     c = *(++ptr);
3370 ph10 391 if (c == CHAR_BACKSLASH)
3371 ph10 205 {
3372 ph10 392 if (ptr[1] == CHAR_E)
3373 ph10 391 ptr++;
3374 ph10 392 else if (strncmp((const char *)ptr+1,
3375     STR_Q STR_BACKSLASH STR_E, 3) == 0)
3376 ph10 391 ptr += 3;
3377 ph10 392 else
3378 ph10 391 break;
3379 ph10 205 }
3380 ph10 391 else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
3381 ph10 205 negate_class = TRUE;
3382     else break;
3383 ph10 208 }
3384 ph10 345
3385     /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
3386     an initial ']' is taken as a data character -- the code below handles
3387 ph10 341 that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
3388     [^] must match any character, so generate OP_ALLANY. */
3389 ph10 345
3390 ph10 392 if (c == CHAR_RIGHT_SQUARE_BRACKET &&
3391 ph10 391 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
3392 ph10 341 {
3393     *code++ = negate_class? OP_ALLANY : OP_FAIL;
3394     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3395     zerofirstbyte = firstbyte;
3396     break;
3397 ph10 345 }
3398 nigel 77
3399 ph10 286 /* If a class contains a negative special such as \S, we need to flip the
3400     negation flag at the end, so that support for characters > 255 works
3401 ph10 264 correctly (they are all included in the class). */
3402    
3403     should_flip_negation = FALSE;
3404    
3405 nigel 77 /* Keep a count of chars with values < 256 so that we can optimize the case
3406 nigel 93 of just a single character (as long as it's < 256). However, For higher
3407     valued UTF-8 characters, we don't yet do any optimization. */
3408 nigel 77
3409     class_charcount = 0;
3410     class_lastchar = -1;
3411    
3412 nigel 93 /* Initialize the 32-char bit map to all zeros. We build the map in a
3413     temporary bit of memory, in case the class contains only 1 character (less
3414     than 256), because in that case the compiled code doesn't use the bit map.
3415     */
3416    
3417     memset(classbits, 0, 32 * sizeof(uschar));
3418    
3419 nigel 77 #ifdef SUPPORT_UTF8
3420     class_utf8 = FALSE; /* No chars >= 256 */
3421 nigel 93 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
3422 ph10 309 class_utf8data_base = class_utf8data; /* For resetting in pass 1 */
3423 nigel 77 #endif
3424    
3425     /* Process characters until ] is reached. By writing this as a "do" it
3426 nigel 93 means that an initial ] is taken as a data character. At the start of the
3427     loop, c contains the first byte of the character. */
3428 nigel 77
3429 nigel 93 if (c != 0) do
3430 nigel 77 {
3431 nigel 93 const uschar *oldptr;
3432    
3433 nigel 77 #ifdef SUPPORT_UTF8
3434     if (utf8 && c > 127)
3435     { /* Braces are required because the */
3436     GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
3437     }
3438 ph10 535
3439 ph10 300 /* In the pre-compile phase, accumulate the length of any UTF-8 extra
3440 ph10 309 data and reset the pointer. This is so that very large classes that
3441 ph10 300 contain a zillion UTF-8 characters no longer overwrite the work space
3442 ph10 309 (which is on the stack). */
3443    
3444 ph10 300 if (lengthptr != NULL)
3445     {
3446     *lengthptr += class_utf8data - class_utf8data_base;
3447 ph10 309 class_utf8data = class_utf8data_base;
3448     }
3449    
3450 nigel 77 #endif
3451    
3452     /* Inside \Q...\E everything is literal except \E */
3453    
3454     if (inescq)
3455     {
3456 ph10 391 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E) /* If we are at \E */
3457 nigel 77 {
3458 nigel 93 inescq = FALSE; /* Reset literal state */
3459     ptr++; /* Skip the 'E' */
3460     continue; /* Carry on with next */
3461 nigel 77 }
3462 nigel 93 goto CHECK_RANGE; /* Could be range if \E follows */
3463 nigel 77 }
3464    
3465     /* Handle POSIX class names. Perl allows a negation extension of the
3466     form [:^name:]. A square bracket that doesn't match the syntax is
3467     treated as a literal. We also recognize the POSIX constructions
3468     [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
3469     5.6 and 5.8 do. */
3470    
3471 ph10 391 if (c == CHAR_LEFT_SQUARE_BRACKET &&
3472 ph10 392 (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3473 ph10 391 ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
3474 nigel 77 {
3475     BOOL local_negate = FALSE;
3476 nigel 87 int posix_class, taboffset, tabopt;
3477 nigel 77 register const uschar *cbits = cd->cbits;
3478 nigel 87 uschar pbits[32];
3479 nigel 77
3480 ph10 391 if (ptr[1] != CHAR_COLON)
3481 nigel 77 {
3482     *errorcodeptr = ERR31;
3483     goto FAILED;
3484     }
3485    
3486     ptr += 2;
3487 ph10 391 if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
3488 nigel 77 {
3489     local_negate = TRUE;
3490 ph10 286 should_flip_negation = TRUE; /* Note negative special */
3491 nigel 77 ptr++;
3492     }
3493    
3494 ph10 530 posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
3495 nigel 77 if (posix_class < 0)
3496     {
3497     *errorcodeptr = ERR30;
3498     goto FAILED;
3499     }
3500    
3501     /* If matching is caseless, upper and lower are converted to
3502     alpha. This relies on the fact that the class table starts with
3503     alpha, lower, upper as the first 3 entries. */
3504    
3505     if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
3506     posix_class = 0;
3507 ph10 535
3508     /* When PCRE_UCP is set, some of the POSIX classes are converted to
3509 ph10 518 different escape sequences that use Unicode properties. */
3510 ph10 535
3511 ph10 518 #ifdef SUPPORT_UCP
3512     if ((options & PCRE_UCP) != 0)
3513     {
3514     int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);
3515     if (posix_substitutes[pc] != NULL)
3516     {
3517 ph10 535 nestptr = tempptr + 1;
3518 ph10 518 ptr = posix_substitutes[pc] - 1;
3519 ph10 535 continue;
3520     }
3521     }
3522     #endif
3523 ph10 518 /* In the non-UCP case, we build the bit map for the POSIX class in a
3524     chunk of local store because we may be adding and subtracting from it,
3525     and we don't want to subtract bits that may be in the main map already.
3526     At the end we or the result into the bit map that is being built. */
3527 nigel 77
3528     posix_class *= 3;
3529 nigel 87
3530     /* Copy in the first table (always present) */
3531    
3532     memcpy(pbits, cbits + posix_class_maps[posix_class],
3533     32 * sizeof(uschar));
3534    
3535     /* If there is a second table, add or remove it as required. */
3536    
3537     taboffset = posix_class_maps[posix_class + 1];
3538     tabopt = posix_class_maps[posix_class + 2];
3539    
3540     if (taboffset >= 0)
3541 nigel 77 {
3542 nigel 87 if (tabopt >= 0)
3543     for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
3544 nigel 77 else
3545 nigel 87 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
3546 nigel 77 }
3547    
3548 nigel 87 /* Now see if we need to remove any special characters. An option
3549     value of 1 removes vertical space and 2 removes underscore. */
3550    
3551     if (tabopt < 0) tabopt = -tabopt;
3552     if (tabopt == 1) pbits[1] &= ~0x3c;
3553     else if (tabopt == 2) pbits[11] &= 0x7f;
3554    
3555     /* Add the POSIX table or its complement into the main table that is
3556     being built and we are done. */
3557    
3558     if (local_negate)
3559     for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
3560     else
3561     for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
3562    
3563 nigel 77 ptr = tempptr + 1;
3564     class_charcount = 10; /* Set > 1; assumes more than 1 per class */
3565     continue; /* End of POSIX syntax handling */
3566     }
3567    
3568     /* Backslash may introduce a single character, or it may introduce one
3569 nigel 93 of the specials, which just set a flag. The sequence \b is a special
3570 ph10 513 case. Inside a class (and only there) it is treated as backspace. We
3571     assume that other escapes have more than one character in them, so set
3572     class_charcount bigger than one. Unrecognized escapes fall through and
3573     are either treated as literal characters (by default), or are faulted if
3574     PCRE_EXTRA is set. */
3575 nigel 77
3576 ph10 391 if (c == CHAR_BACKSLASH)
3577 nigel 77 {
3578 nigel 93 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3579     if (*errorcodeptr != 0) goto FAILED;
3580 nigel 77
3581 ph10 513 if (-c == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
3582 nigel 77 else if (-c == ESC_Q) /* Handle start of quoted string */
3583     {
3584 ph10 391 if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3585 nigel 77 {
3586     ptr += 2; /* avoid empty string */
3587     }
3588     else inescq = TRUE;
3589     continue;
3590     }
3591 ph10 220 else if (-c == ESC_E) continue; /* Ignore orphan \E */
3592 nigel 77
3593     if (c < 0)
3594     {
3595     register const uschar *cbits = cd->cbits;
3596     class_charcount += 2; /* Greater than 1 is what matters */
3597 nigel 93
3598 ph10 518 switch (-c)
3599 nigel 77 {
3600 ph10 518 #ifdef SUPPORT_UCP
3601     case ESC_du: /* These are the values given for \d etc */
3602     case ESC_DU: /* when PCRE_UCP is set. We replace the */
3603     case ESC_wu: /* escape sequence with an appropriate \p */
3604     case ESC_WU: /* or \P to test Unicode properties instead */
3605     case ESC_su: /* of the default ASCII testing. */
3606     case ESC_SU:
3607     nestptr = ptr;
3608     ptr = substitutes[-c - ESC_DU] - 1; /* Just before substitute */
3609 ph10 535 class_charcount -= 2; /* Undo! */
3610 ph10 518 continue;
3611     #endif
3612 nigel 77 case ESC_d:
3613     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
3614     continue;
3615    
3616     case ESC_D:
3617 ph10 286 should_flip_negation = TRUE;
3618 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
3619     continue;
3620    
3621     case ESC_w:
3622     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
3623     continue;
3624    
3625     case ESC_W:
3626 ph10 286 should_flip_negation = TRUE;
3627 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
3628     continue;
3629    
3630 ph10 552 /* Perl 5.004 onwards omits VT from \s, but we must preserve it
3631 ph10 579 if it was previously set by something earlier in the character
3632     class. */
3633 ph10 552
3634 nigel 77 case ESC_s:
3635 ph10 552 classbits[0] |= cbits[cbit_space];
3636 ph10 579 classbits[1] |= cbits[cbit_space+1] & ~0x08;
3637 ph10 552 for (c = 2; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
3638 nigel 77 continue;
3639    
3640     case ESC_S:
3641 ph10 286 should_flip_negation = TRUE;
3642 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
3643     classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
3644     continue;
3645    
3646 ph10 518 case ESC_h:
3647 ph10 178 SETBIT(classbits, 0x09); /* HT */
3648     SETBIT(classbits, 0x20); /* SPACE */
3649 ph10 180 SETBIT(classbits, 0xa0); /* NBSP */
3650 ph10 178 #ifdef SUPPORT_UTF8
3651     if (utf8)
3652 ph10 180 {
3653 ph10 178 class_utf8 = TRUE;
3654     *class_utf8data++ = XCL_SINGLE;
3655 ph10 180 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
3656 ph10 178 *class_utf8data++ = XCL_SINGLE;
3657 ph10 180 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
3658     *class_utf8data++ = XCL_RANGE;
3659     class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
3660     class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
3661 ph10 178 *class_utf8data++ = XCL_SINGLE;
3662 ph10 180 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
3663 ph10 178 *class_utf8data++ = XCL_SINGLE;
3664 ph10 180 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
3665 ph10 178 *class_utf8data++ = XCL_SINGLE;
3666 ph10 180 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
3667     }
3668     #endif
3669     continue;
3670 nigel 93
3671 ph10 518 case ESC_H:
3672 ph10 178 for (c = 0; c < 32; c++)
3673     {
3674     int x = 0xff;
3675     switch (c)
3676 ph10 180 {
3677 ph10 178 case 0x09/8: x ^= 1 << (0x09%8); break;
3678     case 0x20/8: x ^= 1 << (0x20%8); break;
3679     case 0xa0/8: x ^= 1 << (0xa0%8); break;
3680     default: break;
3681     }
3682     classbits[c] |= x;
3683 ph10 180 }
3684    
3685 ph10 178 #ifdef SUPPORT_UTF8
3686     if (utf8)
3687 ph10 180 {
3688 ph10 178 class_utf8 = TRUE;
3689 ph10 180 *class_utf8data++ = XCL_RANGE;
3690     class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3691     class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
3692     *class_utf8data++ = XCL_RANGE;
3693     class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
3694     class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
3695     *class_utf8data++ = XCL_RANGE;
3696     class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
3697     class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
3698     *class_utf8data++ = XCL_RANGE;
3699     class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
3700     class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
3701     *class_utf8data++ = XCL_RANGE;
3702     class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
3703     class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
3704     *class_utf8data++ = XCL_RANGE;
3705     class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
3706     class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
3707     *class_utf8data++ = XCL_RANGE;
3708     class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
3709     class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3710     }
3711     #endif
3712     continue;
3713 ph10 178
3714 ph10 518 case ESC_v:
3715 ph10 178 SETBIT(classbits, 0x0a); /* LF */
3716     SETBIT(classbits, 0x0b); /* VT */
3717 ph10 180 SETBIT(classbits, 0x0c); /* FF */
3718     SETBIT(classbits, 0x0d); /* CR */
3719     SETBIT(classbits, 0x85); /* NEL */
3720 ph10 178 #ifdef SUPPORT_UTF8
3721     if (utf8)
3722 ph10 180 {
3723 ph10 178 class_utf8 = TRUE;
3724 ph10 180 *class_utf8data++ = XCL_RANGE;
3725     class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
3726     class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3727     }
3728     #endif
3729     continue;
3730 ph10 178
3731 ph10 518 case ESC_V:
3732 ph10 178