/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 213 - (hide annotations) (download)
Wed Aug 15 11:34:14 2007 UTC (7 years, 2 months ago) by ph10
File MIME type: text/plain
File size: 187479 byte(s)
Add integer overflow tests to escape processing.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 117 Copyright (c) 1997-2007 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains the external function pcre_compile(), along with
42     supporting internal functions that are not used by other modules. */
43    
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 199 #include <config.h>
47 ph10 200 #endif
48 ph10 199
49 nigel 93 #define NLBLOCK cd /* Block containing newline information */
50     #define PSSTART start_pattern /* Field containing processed string start */
51     #define PSEND end_pattern /* Field containing processed string end */
52    
53 nigel 77 #include "pcre_internal.h"
54    
55    
56 nigel 85 /* When DEBUG is defined, we need the pcre_printint() function, which is also
57     used by pcretest. DEBUG is not defined when building a production library. */
58    
59     #ifdef DEBUG
60     #include "pcre_printint.src"
61     #endif
62    
63    
/* Macro for setting individual bits in class bitmaps. "a" is an array of
bytes used as a bitmap and "b" is the number of the bit to set (bit b lives in
byte b/8, at position b%8). Every use of an argument is parenthesized, and so
is the whole expansion, so that expressions such as "x + 1" can be passed
safely. Note that "b" is evaluated twice: do not pass an argument that has side
effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
67    
68 ph10 202 /* Maximum length value to check against when making sure that the integer that
69     holds the compiled pattern length does not overflow. We make it a bit less than
70     INT_MAX to allow for adding in group terminating bytes, so that we don't have
71     to check them every time. */
72 ph10 178
73 ph10 202 #define OFLOW_MAX (INT_MAX - 20)
74    
75    
76 nigel 77 /*************************************************
77     * Code parameters and static tables *
78     *************************************************/
79    
80 nigel 93 /* This value specifies the size of stack workspace that is used during the
81     first pre-compile phase that determines how much memory is required. The regex
82     is partly compiled into this space, but the compiled parts are discarded as
83     soon as they can be, so that hopefully there will never be an overrun. The code
84     does, however, check for an overrun. The largest amount I've seen used is 218,
85     so this number is very generous.
86 nigel 77
87 nigel 93 The same workspace is used during the second, actual compile phase for
88     remembering forward references to groups so that they can be filled in at the
89     end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
90     is 4 there is plenty of room. */
91 nigel 77
92 nigel 93 #define COMPILE_WORK_SIZE (4096)
93 nigel 77
94 nigel 93
95 nigel 77 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
96     are simple data values; negative values are for special things like \d and so
97     on. Zero means further processing is needed (for things like \x), or the escape
98     is invalid. */
99    
100 ph10 97 #ifndef EBCDIC /* This is the "normal" table for ASCII systems */
101 nigel 77 static const short int escapes[] = {
102     0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
103     0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
104     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
105 ph10 178 -ESC_H, 0, 0, -ESC_K, 0, 0, 0, 0, /* H - O */
106     -ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, -ESC_V, -ESC_W, /* P - W */
107 nigel 77 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
108     '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
109 ph10 178 -ESC_h, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */
110     -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, -ESC_v, -ESC_w, /* p - w */
111 nigel 77 0, 0, -ESC_z /* x - z */
112     };
113    
114 ph10 97 #else /* This is the "abnormal" table for EBCDIC systems */
115 nigel 77 static const short int escapes[] = {
116     /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
117     /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
118     /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
119     /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
120     /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
121     /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
122     /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
123     /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
124 ph10 178 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
125 nigel 93 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
126 nigel 77 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
127 ph10 178 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
128 nigel 77 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
129     /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
130     /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
131     /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
132 ph10 178 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
133 ph10 195 /* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P,
134 nigel 93 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
135 ph10 178 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
136 nigel 77 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
137     /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
138     /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
139     };
140     #endif
141    
142    
143 ph10 210 /* Table of special "verbs" like (*PRUNE) */
144    
145     typedef struct verbitem {
146     const char *name;
147     int len;
148     int op;
149 ph10 211 } verbitem;
150 ph10 210
151     static verbitem verbs[] = {
152     { "ACCEPT", 6, OP_ACCEPT },
153     { "COMMIT", 6, OP_COMMIT },
154     { "F", 1, OP_FAIL },
155 ph10 211 { "FAIL", 4, OP_FAIL },
156 ph10 210 { "PRUNE", 5, OP_PRUNE },
157     { "SKIP", 4, OP_SKIP },
158     { "THEN", 4, OP_THEN }
159     };
160    
161     static int verbcount = sizeof(verbs)/sizeof(verbitem);
162    
163    
/* Names of the POSIX character classes. The lengths of these names are held
in a companion table, posix_name_lengths, which ends with a zero-length entry.
The order matters: alpha, lower, and upper must be the first three, because
this is assumed for handling case independence. */

static const char *const posix_names[] = {
  "alpha",     /*  0 */
  "lower",     /*  1 */
  "upper",     /*  2 */
  "alnum",     /*  3 */
  "ascii",     /*  4 */
  "blank",     /*  5 */
  "cntrl",     /*  6 */
  "digit",     /*  7 */
  "graph",     /*  8 */
  "print",     /*  9 */
  "punct",     /* 10 */
  "space",     /* 11 */
  "word",      /* 12 */
  "xdigit"     /* 13 */
};
172    
173     static const uschar posix_name_lengths[] = {
174     5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
175    
176 nigel 87 /* Table of class bit maps for each POSIX class. Each class is formed from a
177     base map, with an optional addition or removal of another map. Then, for some
178     classes, there is some additional tweaking: for [:blank:] the vertical space
179     characters are removed, and for [:alpha:] and [:alnum:] the underscore
180     character is removed. The triples in the table consist of the base map offset,
181     second map offset or -1 if no second map, and a non-negative value for map
182     addition or a negative value for map subtraction (if there are two maps). The
183     absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
184     remove vertical space characters, 2 => remove underscore. */
185 nigel 77
186     static const int posix_class_maps[] = {
187 nigel 87 cbit_word, cbit_digit, -2, /* alpha */
188     cbit_lower, -1, 0, /* lower */
189     cbit_upper, -1, 0, /* upper */
190     cbit_word, -1, 2, /* alnum - word without underscore */
191     cbit_print, cbit_cntrl, 0, /* ascii */
192     cbit_space, -1, 1, /* blank - a GNU extension */
193     cbit_cntrl, -1, 0, /* cntrl */
194     cbit_digit, -1, 0, /* digit */
195     cbit_graph, -1, 0, /* graph */
196     cbit_print, -1, 0, /* print */
197     cbit_punct, -1, 0, /* punct */
198     cbit_space, -1, 0, /* space */
199     cbit_word, -1, 0, /* word - a Perl extension */
200     cbit_xdigit,-1, 0 /* xdigit */
201 nigel 77 };
202    
203    
/* Two-level stringification: XSTRING(x) expands x (if it is a macro) before
turning it into a string literal. Used below to embed numeric limits in the
error messages. */

#define STRING(a)  # a
#define XSTRING(s) STRING(s)

/* The texts of compile-time error messages. These are "char *" because they
are passed to the outside world. Do not ever re-use any error number, because
they are documented. Always add a new error instead. Messages marked DEAD below
are no longer used.

The position of a message in this table is its error number (presumably the
value of the corresponding ERRnn code; the lookup itself is not in this part of
the file). The table is read-only, so the array of pointers is itself declared
const as well as the strings it points to. */

static const char * const error_texts[] = {
  /* 0 */
  "no error",
  "\\ at end of pattern",
  "\\c at end of pattern",
  "unrecognized character follows \\",
  "numbers out of order in {} quantifier",
  /* 5 */
  "number too big in {} quantifier",
  "missing terminating ] for character class",
  "invalid escape sequence in character class",
  "range out of order in character class",
  "nothing to repeat",
  /* 10 */
  "operand of unlimited repeat could match the empty string",  /** DEAD **/
  "internal error: unexpected repeat",
  "unrecognized character after (?",
  "POSIX named classes are supported only within a class",
  "missing )",
  /* 15 */
  "reference to non-existent subpattern",
  "erroffset passed as NULL",
  "unknown option bit(s) set",
  "missing ) after comment",
  "parentheses nested too deeply",  /** DEAD **/
  /* 20 */
  "regular expression is too large",
  "failed to get memory",
  "unmatched parentheses",
  "internal error: code overflow",
  "unrecognized character after (?<",
  /* 25 */
  "lookbehind assertion is not fixed length",
  "malformed number or name after (?(",
  "conditional group contains more than two branches",
  "assertion expected after (?(",
  "(?R or (?[+-]digits must be followed by )",
  /* 30 */
  "unknown POSIX class name",
  "POSIX collating elements are not supported",
  "this version of PCRE is not compiled with PCRE_UTF8 support",
  "spare error",  /** DEAD **/
  "character value in \\x{...} sequence is too large",
  /* 35 */
  "invalid condition (?(0)",
  "\\C not allowed in lookbehind assertion",
  "PCRE does not support \\L, \\l, \\N, \\U, or \\u",
  "number after (?C is > 255",
  "closing ) for (?C expected",
  /* 40 */
  "recursive call could loop indefinitely",
  "unrecognized character after (?P",
  "syntax error in subpattern name (missing terminator)",
  "two named subpatterns have the same name",
  "invalid UTF-8 string",
  /* 45 */
  "support for \\P, \\p, and \\X has not been compiled",
  "malformed \\P or \\p sequence",
  "unknown property name after \\P or \\p",
  "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)",
  "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")",
  /* 50 */
  "repeated subpattern is too long",    /** DEAD **/
  "octal value is greater than \\377 (not in UTF-8 mode)",
  "internal error: overran compiling workspace",
  "internal error: previously-checked referenced subpattern not found",
  "DEFINE group contains more than one branch",
  /* 55 */
  "repeating a DEFINE group is not allowed",
  "inconsistent NEWLINE options",
  "\\g is not followed by a braced name or an optionally braced non-zero number",
  "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number",
  "(*VERB) with an argument is not supported",
  /* 60 */
  "(*VERB) not recognized",
  "number is too big"
};
288    
289    
290     /* Table to identify digits and hex digits. This is used when compiling
291     patterns. Note that the tables in chartables are dependent on the locale, and
292     may mark arbitrary characters as digits - but the PCRE compiling code expects
293     to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
294     a private table here. It costs 256 bytes, but it is a lot faster than doing
295     character value tests (at least in some simple cases I timed), and in some
296     applications one wants PCRE to compile efficiently as well as match
297     efficiently.
298    
299     For convenience, we use the same bit definitions as in chartables:
300    
301     0x04 decimal digit
302     0x08 hexadecimal digit
303    
304     Then we can use ctype_digit and ctype_xdigit in the code. */
305    
/* Private digit tables, indexed by character value (256 entries each). In
digitab, 0x04 marks a decimal digit and 0x08 a hexadecimal digit, so '0'-'9'
hold 0x0c and the letters a-f and A-F hold 0x08. Each row below holds 16
entries; the comment gives the (hexadecimal) index of the first one. */

#ifndef EBCDIC  /* The "normal" case, for ASCII systems */
static const unsigned char digitab[] =
  {
  /* 00 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 10 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 20 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 30 */ 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 0-9 */
  /* 40 */ 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* A-F */
  /* 50 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 60 */ 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* a-f */
  /* 70 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 80 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 90 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* A0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* B0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* C0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* D0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* E0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* F0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
  };

#else           /* The "abnormal" case, for EBCDIC systems */
static const unsigned char digitab[] =
  {
  /* 00 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 10 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 20 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 30 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 40 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 50 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 60 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 70 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 80 */ 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* a-f */
  /* 90 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* A0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* B0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* C0 */ 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* A-F */
  /* D0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* E0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* F0 */ 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00  /* 0-9 */
  };

/* Partial duplicate of the character type table, for EBCDIC only. */

static const unsigned char ebcdic_chartab[] = {
  /* 00 */ 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00,
  /* 10 */ 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 20 */ 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 30 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 40 */ 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80,
  /* 50 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00,
  /* 60 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80,
  /* 70 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 80 */ 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12,0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 90 */ 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* A0 */ 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* B0 */ 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00,
  /* C0 */ 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12,0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* D0 */ 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* E0 */ 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* F0 */ 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00
  };
#endif
412    
413    
414     /* Definition to allow mutual recursion */
415    
416     static BOOL
417 ph10 180 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
418 ph10 175 int *, int *, branch_chain *, compile_data *, int *);
419 nigel 77
420    
421    
422     /*************************************************
423     * Handle escapes *
424     *************************************************/
425    
426     /* This function is called when a \ has been encountered. It either returns a
427     positive value for a simple escape such as \n, or a negative value which
428 nigel 93 encodes one of the more complicated things such as \d. A backreference to group
429     n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
430     UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
431     ptr is pointing at the \. On exit, it is on the final character of the escape
432     sequence.
433 nigel 77
434     Arguments:
435     ptrptr points to the pattern position pointer
436     errorcodeptr points to the errorcode variable
437     bracount number of previous extracting brackets
438     options the options bits
439     isclass TRUE if inside a character class
440    
441     Returns: zero or positive => a data character
442     negative => a special escape sequence
443 ph10 213 on error, errorcodeptr is set
444 nigel 77 */
445    
446     static int
447     check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
448     int options, BOOL isclass)
449     {
450 nigel 87 BOOL utf8 = (options & PCRE_UTF8) != 0;
451     const uschar *ptr = *ptrptr + 1;
452 nigel 77 int c, i;
453    
454 nigel 87 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
455     ptr--; /* Set pointer back to the last byte */
456    
457 nigel 77 /* If backslash is at the end of the pattern, it's an error. */
458    
459     if (c == 0) *errorcodeptr = ERR1;
460    
461     /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
462     a table. A non-zero result is something that can be returned immediately.
463     Otherwise further processing may be required. */
464    
465 ph10 97 #ifndef EBCDIC /* ASCII coding */
466 nigel 77 else if (c < '0' || c > 'z') {} /* Not alphameric */
467     else if ((i = escapes[c - '0']) != 0) c = i;
468    
469 ph10 97 #else /* EBCDIC coding */
470 nigel 77 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
471     else if ((i = escapes[c - 0x48]) != 0) c = i;
472     #endif
473    
474     /* Escapes that need further processing, or are illegal. */
475    
476     else
477     {
478     const uschar *oldptr;
479 nigel 93 BOOL braced, negated;
480    
481 nigel 77 switch (c)
482     {
483     /* A number of Perl escapes are not handled by PCRE. We give an explicit
484     error. */
485    
486     case 'l':
487     case 'L':
488     case 'N':
489     case 'u':
490     case 'U':
491     *errorcodeptr = ERR37;
492     break;
493    
494 nigel 93 /* \g must be followed by a number, either plain or braced. If positive, it
495     is an absolute backreference. If negative, it is a relative backreference.
496 ph10 172 This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a
497     reference to a named group. This is part of Perl's movement towards a
498     unified syntax for back references. As this is synonymous with \k{name}, we
499 ph10 171 fudge it up by pretending it really was \k. */
500 nigel 93
501     case 'g':
502     if (ptr[1] == '{')
503     {
504 ph10 171 const uschar *p;
505     for (p = ptr+2; *p != 0 && *p != '}'; p++)
506     if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
507 ph10 172 if (*p != 0 && *p != '}')
508 ph10 171 {
509     c = -ESC_k;
510     break;
511 ph10 172 }
512 nigel 93 braced = TRUE;
513     ptr++;
514     }
515     else braced = FALSE;
516    
517     if (ptr[1] == '-')
518     {
519     negated = TRUE;
520     ptr++;
521     }
522     else negated = FALSE;
523    
524     c = 0;
525     while ((digitab[ptr[1]] & ctype_digit) != 0)
526     c = c * 10 + *(++ptr) - '0';
527 ph10 213
528     if (c < 0)
529     {
530     *errorcodeptr = ERR61;
531     break;
532     }
533 nigel 93
534     if (c == 0 || (braced && *(++ptr) != '}'))
535     {
536     *errorcodeptr = ERR57;
537 ph10 213 break;
538 nigel 93 }
539    
540     if (negated)
541     {
542     if (c > bracount)
543     {
544     *errorcodeptr = ERR15;
545 ph10 213 break;
546 nigel 93 }
547     c = bracount - (c - 1);
548     }
549    
550     c = -(ESC_REF + c);
551     break;
552    
553 nigel 77 /* The handling of escape sequences consisting of a string of digits
554     starting with one that is not zero is not straightforward. By experiment,
555     the way Perl works seems to be as follows:
556    
557     Outside a character class, the digits are read as a decimal number. If the
558     number is less than 10, or if there are that many previous extracting
559     left brackets, then it is a back reference. Otherwise, up to three octal
560     digits are read to form an escaped byte. Thus \123 is likely to be octal
561     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
562     value is greater than 377, the least significant 8 bits are taken. Inside a
563     character class, \ followed by a digit is always an octal number. */
564    
565     case '1': case '2': case '3': case '4': case '5':
566     case '6': case '7': case '8': case '9':
567    
568     if (!isclass)
569     {
570     oldptr = ptr;
571     c -= '0';
572     while ((digitab[ptr[1]] & ctype_digit) != 0)
573     c = c * 10 + *(++ptr) - '0';
574 ph10 213 if (c < 0)
575     {
576     *errorcodeptr = ERR61;
577     break;
578     }
579 nigel 77 if (c < 10 || c <= bracount)
580     {
581     c = -(ESC_REF + c);
582     break;
583     }
584     ptr = oldptr; /* Put the pointer back and fall through */
585     }
586    
587     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
588     generates a binary zero byte and treats the digit as a following literal.
589     Thus we have to pull back the pointer by one. */
590    
591     if ((c = *ptr) >= '8')
592     {
593     ptr--;
594     c = 0;
595     break;
596     }
597    
598     /* \0 always starts an octal number, but we may drop through to here with a
599 nigel 91 larger first octal digit. The original code used just to take the least
600     significant 8 bits of octal numbers (I think this is what early Perls used
601     to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
602     than 3 octal digits. */
603 nigel 77
604     case '0':
605     c -= '0';
606     while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
607     c = c * 8 + *(++ptr) - '0';
608 nigel 91 if (!utf8 && c > 255) *errorcodeptr = ERR51;
609 nigel 77 break;
610    
611 nigel 87 /* \x is complicated. \x{ddd} is a character number which can be greater
612     than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
613     treated as a data character. */
614 nigel 77
615     case 'x':
616 nigel 87 if (ptr[1] == '{')
617 nigel 77 {
618     const uschar *pt = ptr + 2;
619 nigel 87 int count = 0;
620    
621 nigel 77 c = 0;
622     while ((digitab[*pt] & ctype_xdigit) != 0)
623     {
624 nigel 87 register int cc = *pt++;
625     if (c == 0 && cc == '0') continue; /* Leading zeroes */
626 nigel 77 count++;
627 nigel 87
628 ph10 97 #ifndef EBCDIC /* ASCII coding */
629 nigel 77 if (cc >= 'a') cc -= 32; /* Convert to upper case */
630 nigel 87 c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
631 ph10 97 #else /* EBCDIC coding */
632 nigel 77 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
633 nigel 87 c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
634 nigel 77 #endif
635     }
636 nigel 87
637 nigel 77 if (*pt == '}')
638     {
639 nigel 87 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
640 nigel 77 ptr = pt;
641     break;
642     }
643 nigel 87
644 nigel 77 /* If the sequence of hex digits does not end with '}', then we don't
645     recognize this construct; fall through to the normal \x handling. */
646     }
647    
648 nigel 87 /* Read just a single-byte hex-defined char */
649 nigel 77
650     c = 0;
651     while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
652     {
653     int cc; /* Some compilers don't like ++ */
654     cc = *(++ptr); /* in initializers */
655 ph10 97 #ifndef EBCDIC /* ASCII coding */
656 nigel 77 if (cc >= 'a') cc -= 32; /* Convert to upper case */
657     c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
658 ph10 97 #else /* EBCDIC coding */
659 nigel 77 if (cc <= 'z') cc += 64; /* Convert to upper case */
660     c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
661     #endif
662     }
663     break;
664    
665 nigel 93 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
666     This coding is ASCII-specific, but then the whole concept of \cx is
667     ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
668 nigel 77
669     case 'c':
670     c = *(++ptr);
671     if (c == 0)
672     {
673     *errorcodeptr = ERR2;
674 ph10 213 break;
675 nigel 77 }
676    
677 ph10 97 #ifndef EBCDIC /* ASCII coding */
678 nigel 77 if (c >= 'a' && c <= 'z') c -= 32;
679     c ^= 0x40;
680 ph10 97 #else /* EBCDIC coding */
681 nigel 77 if (c >= 'a' && c <= 'z') c += 64;
682     c ^= 0xC0;
683     #endif
684     break;
685    
686     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
687     other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
688     for Perl compatibility, it is a literal. This code looks a bit odd, but
689     there used to be some cases other than the default, and there may be again
690     in future, so I haven't "optimized" it. */
691    
692     default:
693     if ((options & PCRE_EXTRA) != 0) switch(c)
694     {
695     default:
696     *errorcodeptr = ERR3;
697     break;
698     }
699     break;
700     }
701     }
702    
703     *ptrptr = ptr;
704     return c;
705     }
706    
707    
708    
#ifdef SUPPORT_UCP
/*************************************************
*               Handle \P and \p                 *
*************************************************/

/* Parse the property name that follows \P or \p and look it up, by binary
chop, in the _pcre_utt table. This function is compiled only when
SUPPORT_UCP is defined. The name is either the single character after the P
or p, or a sequence enclosed in {}, optionally preceded by ^ for negation. A
name in braces is limited to sizeof(name) - 1 characters. On entry, ptrptr is
pointing at the P or p. On exit, it is pointing at the last character that was
inspected: the final character of the escape sequence on success, or the place
where scanning stopped on failure.

Arguments:
  ptrptr         points to the pattern position pointer
  negptr         points to a boolean that is set TRUE when the name in braces
                   starts with ^, else FALSE (not written at all when the
                   pattern ends immediately after the P or p)
  dptr           points to an int that is set to the value field of the
                   matching table entry
  errorcodeptr   points to the error code variable; set to ERR46 for a
                   malformed sequence and to ERR47 for an unknown name

Returns:         the type field of the matching table entry, or -1 on error
*/

static int
get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
{
const uschar *ptr = *ptrptr;
char name[32];
int ch, len, lo, hi;

ch = *(++ptr);
if (ch == 0) goto ERROR_RETURN;

*negptr = FALSE;

/* The simple form has exactly one character for a name. */

if (ch != '{')
  {
  name[0] = ch;
  name[1] = 0;
  }

/* The braced form: note an optional leading ^, then collect characters up to
the closing brace. Running out of pattern, or out of room in the buffer, before
the brace is seen is an error. */

else
  {
  if (ptr[1] == '^')
    {
    *negptr = TRUE;
    ptr++;
    }

  len = 0;
  while (len < (int)sizeof(name) - 1)
    {
    ch = *(++ptr);
    if (ch == 0) goto ERROR_RETURN;
    if (ch == '}') break;
    name[len++] = ch;
    }
  if (ch != '}') goto ERROR_RETURN;
  name[len] = 0;
  }

*ptrptr = ptr;

/* Binary chop on the property table; [lo, hi) is the range still to search. */

lo = 0;
hi = _pcre_utt_size;

while (lo < hi)
  {
  int mid = (lo + hi) >> 1;
  int cmp = strcmp(name, _pcre_utt[mid].name);

  if (cmp > 0) lo = mid + 1;
  else if (cmp < 0) hi = mid;
  else
    {
    *dptr = _pcre_utt[mid].value;
    return _pcre_utt[mid].type;
    }
  }

/* The name is well formed but not in the table. */

*errorcodeptr = ERR47;
return -1;

/* The sequence itself is malformed. */

ERROR_RETURN:
*errorcodeptr = ERR46;
*ptrptr = ptr;
return -1;
}
#endif
798    
799    
800    
801    
802     /*************************************************
803     * Check for counted repeat *
804     *************************************************/
805    
806     /* This function is called when a '{' is encountered in a place where it might
807     start a quantifier. It looks ahead to see if it really is a quantifier or not.
808     It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
809     where the ddds are digits.
810    
811     Arguments:
812     p pointer to the first char after '{'
813    
814     Returns: TRUE or FALSE
815     */
816    
817     static BOOL
818     is_counted_repeat(const uschar *p)
819     {
820     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
821     while ((digitab[*p] & ctype_digit) != 0) p++;
822     if (*p == '}') return TRUE;
823    
824     if (*p++ != ',') return FALSE;
825     if (*p == '}') return TRUE;
826    
827     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
828     while ((digitab[*p] & ctype_digit) != 0) p++;
829    
830     return (*p == '}');
831     }
832    
833    
834    
835     /*************************************************
836     * Read repeat counts *
837     *************************************************/
838    
839     /* Read an item of the form {n,m} and return the values. This is called only
840     after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
841     so the syntax is guaranteed to be correct, but we need to check the values.
842    
843     Arguments:
844     p pointer to first char after '{'
845     minp pointer to int for min
846     maxp pointer to int for max
847     returned as -1 if no max
848     errorcodeptr points to error code variable
849    
850     Returns: pointer to '}' on success;
851     current ptr on error, with errorcodeptr set non-zero
852     */
853    
854     static const uschar *
855     read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
856     {
857     int min = 0;
858     int max = -1;
859    
860 nigel 81 /* Read the minimum value and do a paranoid check: a negative value indicates
861     an integer overflow. */
862    
863 nigel 77 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
864 nigel 81 if (min < 0 || min > 65535)
865     {
866     *errorcodeptr = ERR5;
867     return p;
868     }
869 nigel 77
870 nigel 81 /* Read the maximum value if there is one, and again do a paranoid on its size.
871     Also, max must not be less than min. */
872    
873 nigel 77 if (*p == '}') max = min; else
874     {
875     if (*(++p) != '}')
876     {
877     max = 0;
878     while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
879 nigel 81 if (max < 0 || max > 65535)
880     {
881     *errorcodeptr = ERR5;
882     return p;
883     }
884 nigel 77 if (max < min)
885     {
886     *errorcodeptr = ERR4;
887     return p;
888     }
889     }
890     }
891    
892 nigel 81 /* Fill in the required variables, and pass back the pointer to the terminating
893     '}'. */
894 nigel 77
895 nigel 81 *minp = min;
896     *maxp = max;
897 nigel 77 return p;
898     }
899    
900    
901    
902     /*************************************************
903 nigel 93 * Find forward referenced subpattern *
904 nigel 91 *************************************************/
905    
906 nigel 93 /* This function scans along a pattern's text looking for capturing
907     subpatterns, and counting them. If it finds a named pattern that matches the
908     name it is given, it returns its number. Alternatively, if the name is NULL, it
909     returns when it reaches a given numbered subpattern. This is used for forward
910     references to subpatterns. We know that if (?P< is encountered, the name will
911     be terminated by '>' because that is checked in the first pass.
912 nigel 91
913     Arguments:
914 nigel 93 ptr current position in the pattern
915     count current count of capturing parens so far encountered
916     name name to seek, or NULL if seeking a numbered subpattern
917     lorn name length, or subpattern number if name is NULL
918     xmode TRUE if we are in /x mode
919 nigel 91
920     Returns: the number of the named subpattern, or -1 if not found
921     */
922    
923     static int
924 nigel 93 find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
925     BOOL xmode)
926 nigel 91 {
927     const uschar *thisname;
928 nigel 93
929 nigel 91 for (; *ptr != 0; ptr++)
930     {
931 nigel 93 int term;
932    
933     /* Skip over backslashed characters and also entire \Q...\E */
934    
935     if (*ptr == '\\')
936     {
937     if (*(++ptr) == 0) return -1;
938     if (*ptr == 'Q') for (;;)
939     {
940     while (*(++ptr) != 0 && *ptr != '\\');
941     if (*ptr == 0) return -1;
942     if (*(++ptr) == 'E') break;
943     }
944     continue;
945     }
946    
947     /* Skip over character classes */
948    
949     if (*ptr == '[')
950     {
951     while (*(++ptr) != ']')
952     {
953     if (*ptr == '\\')
954     {
955     if (*(++ptr) == 0) return -1;
956     if (*ptr == 'Q') for (;;)
957     {
958     while (*(++ptr) != 0 && *ptr != '\\');
959     if (*ptr == 0) return -1;
960     if (*(++ptr) == 'E') break;
961     }
962     continue;
963     }
964     }
965     continue;
966     }
967    
968     /* Skip comments in /x mode */
969    
970     if (xmode && *ptr == '#')
971     {
972     while (*(++ptr) != 0 && *ptr != '\n');
973     if (*ptr == 0) return -1;
974     continue;
975     }
976    
977     /* An opening parens must now be a real metacharacter */
978    
979 nigel 91 if (*ptr != '(') continue;
980 ph10 210 if (ptr[1] != '?' && ptr[1] != '*')
981 nigel 93 {
982     count++;
983     if (name == NULL && count == lorn) return count;
984     continue;
985     }
986    
987     ptr += 2;
988     if (*ptr == 'P') ptr++; /* Allow optional P */
989    
990     /* We have to disambiguate (?<! and (?<= from (?<name> */
991    
992     if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
993     *ptr != '\'')
994     continue;
995    
996 nigel 91 count++;
997 nigel 93
998     if (name == NULL && count == lorn) return count;
999     term = *ptr++;
1000     if (term == '<') term = '>';
1001 nigel 91 thisname = ptr;
1002 nigel 93 while (*ptr != term) ptr++;
1003     if (name != NULL && lorn == ptr - thisname &&
1004     strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1005 nigel 91 return count;
1006     }
1007 nigel 93
1008 nigel 91 return -1;
1009     }
1010    
1011    
1012    
1013     /*************************************************
1014 nigel 77 * Find first significant op code *
1015     *************************************************/
1016    
1017     /* This is called by several functions that scan a compiled expression looking
1018     for a fixed first character, or an anchoring op code etc. It skips over things
1019     that do not influence this. For some calls, a change of option is important.
1020     For some calls, it makes sense to skip negative forward and all backward
1021     assertions, and also the \b assertion; for others it does not.
1022    
1023     Arguments:
1024     code pointer to the start of the group
1025     options pointer to external options
1026     optbit the option bit whose changing is significant, or
1027     zero if none are
1028     skipassert TRUE if certain assertions are to be skipped
1029    
1030     Returns: pointer to the first significant opcode
1031     */
1032    
1033     static const uschar*
1034     first_significant_code(const uschar *code, int *options, int optbit,
1035     BOOL skipassert)
1036     {
1037     for (;;)
1038     {
1039     switch ((int)*code)
1040     {
1041     case OP_OPT:
1042     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1043     *options = (int)code[1];
1044     code += 2;
1045     break;
1046    
1047     case OP_ASSERT_NOT:
1048     case OP_ASSERTBACK:
1049     case OP_ASSERTBACK_NOT:
1050     if (!skipassert) return code;
1051     do code += GET(code, 1); while (*code == OP_ALT);
1052     code += _pcre_OP_lengths[*code];
1053     break;
1054    
1055     case OP_WORD_BOUNDARY:
1056     case OP_NOT_WORD_BOUNDARY:
1057     if (!skipassert) return code;
1058     /* Fall through */
1059    
1060     case OP_CALLOUT:
1061     case OP_CREF:
1062 nigel 93 case OP_RREF:
1063     case OP_DEF:
1064 nigel 77 code += _pcre_OP_lengths[*code];
1065     break;
1066    
1067     default:
1068     return code;
1069     }
1070     }
1071     /* Control never reaches here */
1072     }
1073    
1074    
1075    
1076    
1077     /*************************************************
1078     * Find the fixed length of a pattern *
1079     *************************************************/
1080    
1081     /* Scan a pattern and compute the fixed length of subject that will match it,
1082     if the length is fixed. This is needed for dealing with backward assertions.
1083     In UTF8 mode, the result is in characters rather than bytes.
1084    
1085     Arguments:
1086     code points to the start of the pattern (the bracket)
1087     options the compiling options
1088    
1089     Returns: the fixed length, or -1 if there is no fixed length,
1090     or -2 if \C was encountered
1091     */
1092    
1093     static int
1094     find_fixedlength(uschar *code, int options)
1095     {
1096     int length = -1;
1097    
1098     register int branchlength = 0;
1099     register uschar *cc = code + 1 + LINK_SIZE;
1100    
1101     /* Scan along the opcodes for this branch. If we get to the end of the
1102     branch, check the length against that of the other branches. */
1103    
1104     for (;;)
1105     {
1106     int d;
1107     register int op = *cc;
1108    
1109     switch (op)
1110     {
1111 nigel 93 case OP_CBRA:
1112 nigel 77 case OP_BRA:
1113     case OP_ONCE:
1114     case OP_COND:
1115 nigel 93 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1116 nigel 77 if (d < 0) return d;
1117     branchlength += d;
1118     do cc += GET(cc, 1); while (*cc == OP_ALT);
1119     cc += 1 + LINK_SIZE;
1120     break;
1121    
1122     /* Reached end of a branch; if it's a ket it is the end of a nested
1123     call. If it's ALT it is an alternation in a nested call. If it is
1124     END it's the end of the outer call. All can be handled by the same code. */
1125    
1126     case OP_ALT:
1127     case OP_KET:
1128     case OP_KETRMAX:
1129     case OP_KETRMIN:
1130     case OP_END:
1131     if (length < 0) length = branchlength;
1132     else if (length != branchlength) return -1;
1133     if (*cc != OP_ALT) return length;
1134     cc += 1 + LINK_SIZE;
1135     branchlength = 0;
1136     break;
1137    
1138     /* Skip over assertive subpatterns */
1139    
1140     case OP_ASSERT:
1141     case OP_ASSERT_NOT:
1142     case OP_ASSERTBACK:
1143     case OP_ASSERTBACK_NOT:
1144     do cc += GET(cc, 1); while (*cc == OP_ALT);
1145     /* Fall through */
1146    
1147     /* Skip over things that don't match chars */
1148    
1149     case OP_REVERSE:
1150     case OP_CREF:
1151 nigel 93 case OP_RREF:
1152     case OP_DEF:
1153 nigel 77 case OP_OPT:
1154     case OP_CALLOUT:
1155     case OP_SOD:
1156     case OP_SOM:
1157     case OP_EOD:
1158     case OP_EODN:
1159     case OP_CIRC:
1160     case OP_DOLL:
1161     case OP_NOT_WORD_BOUNDARY:
1162     case OP_WORD_BOUNDARY:
1163     cc += _pcre_OP_lengths[*cc];
1164     break;
1165    
1166     /* Handle literal characters */
1167    
1168     case OP_CHAR:
1169     case OP_CHARNC:
1170 nigel 91 case OP_NOT:
1171 nigel 77 branchlength++;
1172     cc += 2;
1173     #ifdef SUPPORT_UTF8
1174     if ((options & PCRE_UTF8) != 0)
1175     {
1176     while ((*cc & 0xc0) == 0x80) cc++;
1177     }
1178     #endif
1179     break;
1180    
1181     /* Handle exact repetitions. The count is already in characters, but we
1182     need to skip over a multibyte character in UTF8 mode. */
1183    
1184     case OP_EXACT:
1185     branchlength += GET2(cc,1);
1186     cc += 4;
1187     #ifdef SUPPORT_UTF8
1188     if ((options & PCRE_UTF8) != 0)
1189     {
1190     while((*cc & 0x80) == 0x80) cc++;
1191     }
1192     #endif
1193     break;
1194    
1195     case OP_TYPEEXACT:
1196     branchlength += GET2(cc,1);
1197     cc += 4;
1198     break;
1199    
1200     /* Handle single-char matchers */
1201    
1202     case OP_PROP:
1203     case OP_NOTPROP:
1204 nigel 87 cc += 2;
1205 nigel 77 /* Fall through */
1206    
1207     case OP_NOT_DIGIT:
1208     case OP_DIGIT:
1209     case OP_NOT_WHITESPACE:
1210     case OP_WHITESPACE:
1211     case OP_NOT_WORDCHAR:
1212     case OP_WORDCHAR:
1213     case OP_ANY:
1214     branchlength++;
1215     cc++;
1216     break;
1217    
1218     /* The single-byte matcher isn't allowed */
1219    
1220     case OP_ANYBYTE:
1221     return -2;
1222    
1223     /* Check a class for variable quantification */
1224    
1225     #ifdef SUPPORT_UTF8
1226     case OP_XCLASS:
1227     cc += GET(cc, 1) - 33;
1228     /* Fall through */
1229     #endif
1230    
1231     case OP_CLASS:
1232     case OP_NCLASS:
1233     cc += 33;
1234    
1235     switch (*cc)
1236     {
1237     case OP_CRSTAR:
1238     case OP_CRMINSTAR:
1239     case OP_CRQUERY:
1240     case OP_CRMINQUERY:
1241     return -1;
1242    
1243     case OP_CRRANGE:
1244     case OP_CRMINRANGE:
1245     if (GET2(cc,1) != GET2(cc,3)) return -1;
1246     branchlength += GET2(cc,1);
1247     cc += 5;
1248     break;
1249    
1250     default:
1251     branchlength++;
1252     }
1253     break;
1254    
1255     /* Anything else is variable length */
1256    
1257     default:
1258     return -1;
1259     }
1260     }
1261     /* Control never gets here */
1262     }
1263    
1264    
1265    
1266    
1267     /*************************************************
1268     * Scan compiled regex for numbered bracket *
1269     *************************************************/
1270    
1271     /* This little function scans through a compiled pattern until it finds a
1272     capturing bracket with the given number.
1273    
1274     Arguments:
1275     code points to start of expression
1276     utf8 TRUE in UTF-8 mode
1277     number the required bracket number
1278    
1279     Returns: pointer to the opcode for the bracket, or NULL if not found
1280     */
1281    
1282     static const uschar *
1283     find_bracket(const uschar *code, BOOL utf8, int number)
1284     {
1285     for (;;)
1286     {
1287     register int c = *code;
1288     if (c == OP_END) return NULL;
1289 nigel 91
1290     /* XCLASS is used for classes that cannot be represented just by a bit
1291     map. This includes negated single high-valued characters. The length in
1292     the table is zero; the actual length is stored in the compiled code. */
1293    
1294     if (c == OP_XCLASS) code += GET(code, 1);
1295    
1296 nigel 93 /* Handle capturing bracket */
1297 nigel 91
1298 nigel 93 else if (c == OP_CBRA)
1299 nigel 77 {
1300 nigel 93 int n = GET2(code, 1+LINK_SIZE);
1301 nigel 77 if (n == number) return (uschar *)code;
1302 nigel 93 code += _pcre_OP_lengths[c];
1303 nigel 77 }
1304 nigel 91
1305 nigel 93 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1306     a multi-byte character. The length in the table is a minimum, so we have to
1307     arrange to skip the extra bytes. */
1308 nigel 91
1309 nigel 77 else
1310     {
1311     code += _pcre_OP_lengths[c];
1312 ph10 107 #ifdef SUPPORT_UTF8
1313 nigel 77 if (utf8) switch(c)
1314     {
1315     case OP_CHAR:
1316     case OP_CHARNC:
1317     case OP_EXACT:
1318     case OP_UPTO:
1319     case OP_MINUPTO:
1320 nigel 93 case OP_POSUPTO:
1321 nigel 77 case OP_STAR:
1322     case OP_MINSTAR:
1323 nigel 93 case OP_POSSTAR:
1324 nigel 77 case OP_PLUS:
1325     case OP_MINPLUS:
1326 nigel 93 case OP_POSPLUS:
1327 nigel 77 case OP_QUERY:
1328     case OP_MINQUERY:
1329 nigel 93 case OP_POSQUERY:
1330     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1331 nigel 77 break;
1332     }
1333 ph10 111 #endif
1334 nigel 77 }
1335     }
1336     }
1337    
1338    
1339    
1340     /*************************************************
1341     * Scan compiled regex for recursion reference *
1342     *************************************************/
1343    
1344     /* This little function scans through a compiled pattern until it finds an
1345     instance of OP_RECURSE.
1346    
1347     Arguments:
1348     code points to start of expression
1349     utf8 TRUE in UTF-8 mode
1350    
1351     Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1352     */
1353    
1354     static const uschar *
1355     find_recurse(const uschar *code, BOOL utf8)
1356     {
1357     for (;;)
1358     {
1359     register int c = *code;
1360     if (c == OP_END) return NULL;
1361 nigel 91 if (c == OP_RECURSE) return code;
1362    
1363     /* XCLASS is used for classes that cannot be represented just by a bit
1364     map. This includes negated single high-valued characters. The length in
1365     the table is zero; the actual length is stored in the compiled code. */
1366    
1367     if (c == OP_XCLASS) code += GET(code, 1);
1368    
1369     /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes
1370     that are followed by a character may be followed by a multi-byte character.
1371 nigel 93 The length in the table is a minimum, so we have to arrange to skip the extra
1372     bytes. */
1373 nigel 91
1374 nigel 77 else
1375     {
1376     code += _pcre_OP_lengths[c];
1377 ph10 107 #ifdef SUPPORT_UTF8
1378 nigel 77 if (utf8) switch(c)
1379     {
1380     case OP_CHAR:
1381     case OP_CHARNC:
1382     case OP_EXACT:
1383     case OP_UPTO:
1384     case OP_MINUPTO:
1385 nigel 93 case OP_POSUPTO:
1386 nigel 77 case OP_STAR:
1387     case OP_MINSTAR:
1388 nigel 93 case OP_POSSTAR:
1389 nigel 77 case OP_PLUS:
1390     case OP_MINPLUS:
1391 nigel 93 case OP_POSPLUS:
1392 nigel 77 case OP_QUERY:
1393     case OP_MINQUERY:
1394 nigel 93 case OP_POSQUERY:
1395     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1396 nigel 77 break;
1397     }
1398 ph10 111 #endif
1399 nigel 77 }
1400     }
1401     }
1402    
1403    
1404    
1405     /*************************************************
1406     * Scan compiled branch for non-emptiness *
1407     *************************************************/
1408    
1409     /* This function scans through a branch of a compiled pattern to see whether it
1410 nigel 93 can match the empty string or not. It is called from could_be_empty()
1411     below and from compile_branch() when checking for an unlimited repeat of a
1412     group that can match nothing. Note that first_significant_code() skips over
1413     assertions. If we hit an unclosed bracket, we return "empty" - this means we've
1414     struck an inner bracket whose current branch will already have been scanned.
1415 nigel 77
1416     Arguments:
1417     code points to start of search
1418     endcode points to where to stop
1419     utf8 TRUE if in UTF8 mode
1420    
1421     Returns: TRUE if what is matched could be empty
1422     */
1423    
1424     static BOOL
1425     could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1426     {
1427     register int c;
1428 nigel 93 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1429 nigel 77 code < endcode;
1430     code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1431     {
1432     const uschar *ccode;
1433    
1434     c = *code;
1435 ph10 172
1436 ph10 170 /* Groups with zero repeats can of course be empty; skip them. */
1437 nigel 77
1438 ph10 170 if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1439     {
1440 ph10 172 code += _pcre_OP_lengths[c];
1441 ph10 170 do code += GET(code, 1); while (*code == OP_ALT);
1442     c = *code;
1443     continue;
1444     }
1445    
1446     /* For other groups, scan the branches. */
1447 ph10 172
1448 ph10 206 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1449 nigel 77 {
1450     BOOL empty_branch;
1451     if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1452    
1453     /* Scan a closed bracket */
1454    
1455     empty_branch = FALSE;
1456     do
1457     {
1458     if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1459     empty_branch = TRUE;
1460     code += GET(code, 1);
1461     }
1462     while (*code == OP_ALT);
1463     if (!empty_branch) return FALSE; /* All branches are non-empty */
1464 ph10 172 c = *code;
1465 nigel 93 continue;
1466 nigel 77 }
1467    
1468 nigel 93 /* Handle the other opcodes */
1469    
1470     switch (c)
1471 nigel 77 {
1472     /* Check for quantifiers after a class */
1473    
1474     #ifdef SUPPORT_UTF8
1475     case OP_XCLASS:
1476     ccode = code + GET(code, 1);
1477     goto CHECK_CLASS_REPEAT;
1478     #endif
1479    
1480     case OP_CLASS:
1481     case OP_NCLASS:
1482     ccode = code + 33;
1483    
1484     #ifdef SUPPORT_UTF8
1485     CHECK_CLASS_REPEAT:
1486     #endif
1487    
1488     switch (*ccode)
1489     {
1490     case OP_CRSTAR: /* These could be empty; continue */
1491     case OP_CRMINSTAR:
1492     case OP_CRQUERY:
1493     case OP_CRMINQUERY:
1494     break;
1495    
1496     default: /* Non-repeat => class must match */
1497     case OP_CRPLUS: /* These repeats aren't empty */
1498     case OP_CRMINPLUS:
1499     return FALSE;
1500    
1501     case OP_CRRANGE:
1502     case OP_CRMINRANGE:
1503     if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1504     break;
1505     }
1506     break;
1507    
1508     /* Opcodes that must match a character */
1509    
1510     case OP_PROP:
1511     case OP_NOTPROP:
1512     case OP_EXTUNI:
1513     case OP_NOT_DIGIT:
1514     case OP_DIGIT:
1515     case OP_NOT_WHITESPACE:
1516     case OP_WHITESPACE:
1517     case OP_NOT_WORDCHAR:
1518     case OP_WORDCHAR:
1519     case OP_ANY:
1520     case OP_ANYBYTE:
1521     case OP_CHAR:
1522     case OP_CHARNC:
1523     case OP_NOT:
1524     case OP_PLUS:
1525     case OP_MINPLUS:
1526 nigel 93 case OP_POSPLUS:
1527 nigel 77 case OP_EXACT:
1528     case OP_NOTPLUS:
1529     case OP_NOTMINPLUS:
1530 nigel 93 case OP_NOTPOSPLUS:
1531 nigel 77 case OP_NOTEXACT:
1532     case OP_TYPEPLUS:
1533     case OP_TYPEMINPLUS:
1534 nigel 93 case OP_TYPEPOSPLUS:
1535 nigel 77 case OP_TYPEEXACT:
1536     return FALSE;
1537    
1538     /* End of branch */
1539    
1540     case OP_KET:
1541     case OP_KETRMAX:
1542     case OP_KETRMIN:
1543     case OP_ALT:
1544     return TRUE;
1545    
1546 nigel 93 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1547     MINUPTO, and POSUPTO may be followed by a multibyte character */
1548 nigel 77
1549     #ifdef SUPPORT_UTF8
1550     case OP_STAR:
1551     case OP_MINSTAR:
1552 nigel 93 case OP_POSSTAR:
1553 nigel 77 case OP_QUERY:
1554     case OP_MINQUERY:
1555 nigel 93 case OP_POSQUERY:
1556 nigel 77 case OP_UPTO:
1557     case OP_MINUPTO:
1558 nigel 93 case OP_POSUPTO:
1559 nigel 77 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1560     break;
1561     #endif
1562     }
1563     }
1564    
1565     return TRUE;
1566     }
1567    
1568    
1569    
1570     /*************************************************
1571     * Scan compiled regex for non-emptiness *
1572     *************************************************/
1573    
1574     /* This function is called to check for left recursive calls. We want to check
1575     the current branch of the current pattern to see if it could match the empty
1576     string. If it could, we must look outwards for branches at other levels,
1577     stopping when we pass beyond the bracket which is the subject of the recursion.
1578    
1579     Arguments:
1580     code points to start of the recursion
1581     endcode points to where to stop (current RECURSE item)
1582     bcptr points to the chain of current (unclosed) branch starts
1583     utf8 TRUE if in UTF-8 mode
1584    
1585     Returns: TRUE if what is matched could be empty
1586     */
1587    
1588     static BOOL
1589     could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1590     BOOL utf8)
1591     {
1592     while (bcptr != NULL && bcptr->current >= code)
1593     {
1594     if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1595     bcptr = bcptr->outer;
1596     }
1597     return TRUE;
1598     }
1599    
1600    
1601    
1602     /*************************************************
1603     * Check for POSIX class syntax *
1604     *************************************************/
1605    
1606     /* This function is called when the sequence "[:" or "[." or "[=" is
1607     encountered in a character class. It checks whether this is followed by an
1608     optional ^ and then a sequence of letters, terminated by a matching ":]" or
1609     ".]" or "=]".
1610    
1611     Argument:
1612     ptr pointer to the initial [
1613     endptr where to return the end pointer
1614     cd pointer to compile data
1615    
1616     Returns: TRUE or FALSE
1617     */
1618    
1619     static BOOL
1620     check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1621     {
1622     int terminator; /* Don't combine these lines; the Solaris cc */
1623     terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1624     if (*(++ptr) == '^') ptr++;
1625     while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1626     if (*ptr == terminator && ptr[1] == ']')
1627     {
1628     *endptr = ptr;
1629     return TRUE;
1630     }
1631     return FALSE;
1632     }
1633    
1634    
1635    
1636    
1637     /*************************************************
1638     * Check POSIX class name *
1639     *************************************************/
1640    
1641     /* This function is called to check the name given in a POSIX-style class entry
1642     such as [:alnum:].
1643    
1644     Arguments:
1645     ptr points to the first letter
1646     len the length of the name
1647    
1648     Returns: a value representing the name, or -1 if unknown
1649     */
1650    
1651     static int
1652     check_posix_name(const uschar *ptr, int len)
1653     {
1654     register int yield = 0;
1655     while (posix_name_lengths[yield] != 0)
1656     {
1657     if (len == posix_name_lengths[yield] &&
1658     strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1659     yield++;
1660     }
1661     return -1;
1662     }
1663    
1664    
1665     /*************************************************
1666     * Adjust OP_RECURSE items in repeated group *
1667     *************************************************/
1668    
1669     /* OP_RECURSE items contain an offset from the start of the regex to the group
1670     that is referenced. This means that groups can be replicated for fixed
1671     repetition simply by copying (because the recursion is allowed to refer to
1672     earlier groups that are outside the current group). However, when a group is
1673     optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1674     it, after it has been compiled. This means that any OP_RECURSE items within it
1675     that refer to the group itself or any contained groups have to have their
1676 nigel 93 offsets adjusted. That one of the jobs of this function. Before it is called,
1677     the partially compiled regex must be temporarily terminated with OP_END.
1678 nigel 77
1679 nigel 93 This function has been extended with the possibility of forward references for
1680     recursions and subroutine calls. It must also check the list of such references
1681     for the group we are dealing with. If it finds that one of the recursions in
1682     the current group is on this list, it adjusts the offset in the list, not the
1683     value in the reference (which is a group number).
1684    
1685 nigel 77 Arguments:
1686     group points to the start of the group
1687     adjust the amount by which the group is to be moved
1688     utf8 TRUE in UTF-8 mode
1689     cd contains pointers to tables etc.
1690 nigel 93 save_hwm the hwm forward reference pointer at the start of the group
1691 nigel 77
1692     Returns: nothing
1693     */
1694    
1695     static void
1696 nigel 93 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1697     uschar *save_hwm)
1698 nigel 77 {
1699     uschar *ptr = group;
1700     while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1701     {
1702 nigel 93 int offset;
1703     uschar *hc;
1704    
1705     /* See if this recursion is on the forward reference list. If so, adjust the
1706     reference. */
1707    
1708     for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1709     {
1710     offset = GET(hc, 0);
1711     if (cd->start_code + offset == ptr + 1)
1712     {
1713     PUT(hc, 0, offset + adjust);
1714     break;
1715     }
1716     }
1717    
1718     /* Otherwise, adjust the recursion offset if it's after the start of this
1719     group. */
1720    
1721     if (hc >= cd->hwm)
1722     {
1723     offset = GET(ptr, 1);
1724     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1725     }
1726    
1727 nigel 77 ptr += 1 + LINK_SIZE;
1728     }
1729     }
1730    
1731    
1732    
1733     /*************************************************
1734     * Insert an automatic callout point *
1735     *************************************************/
1736    
1737     /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1738     callout points before each pattern item.
1739    
1740     Arguments:
1741     code current code pointer
1742     ptr current pattern pointer
1743     cd pointers to tables etc
1744    
1745     Returns: new code pointer
1746     */
1747    
1748     static uschar *
1749     auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1750     {
1751     *code++ = OP_CALLOUT;
1752     *code++ = 255;
1753     PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1754     PUT(code, LINK_SIZE, 0); /* Default length */
1755     return code + 2*LINK_SIZE;
1756     }
1757    
1758    
1759    
1760     /*************************************************
1761     * Complete a callout item *
1762     *************************************************/
1763    
1764     /* A callout item contains the length of the next item in the pattern, which
1765     we can't fill in till after we have reached the relevant point. This is used
1766     for both automatic and manual callouts.
1767    
1768     Arguments:
1769     previous_callout points to previous callout item
1770     ptr current pattern pointer
1771     cd pointers to tables etc
1772    
1773     Returns: nothing
1774     */
1775    
1776     static void
1777     complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1778     {
1779     int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1780     PUT(previous_callout, 2 + LINK_SIZE, length);
1781     }
1782    
1783    
1784    
#ifdef SUPPORT_UCP
/*************************************************
*           Get othercase range                  *
*************************************************/

/* Given the start and end of a class range (UTF-8 mode with UCP support), find
the next run of characters inside it whose "other case" values are consecutive.
Characters for which _pcre_ucp_othercase() yields NOTACHAR are skipped. Each
call hands back one such run and advances the start value so that the next
call continues from where this one stopped.

Arguments:
  cptr        points to starting character value; updated
  d           end value
  ocptr       where to put start of othercase range
  odptr       where to put end of othercase range

Yield:        TRUE when range returned; FALSE when no more
*/

static BOOL
get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
  unsigned int *odptr)
{
unsigned int ch = *cptr;
unsigned int first_other, expected;

/* Find the first character in the range that has an other case. */

while (ch <= d)
  {
  first_other = _pcre_ucp_othercase(ch);
  if (first_other != NOTACHAR) break;
  ch++;
  }

if (ch > d) return FALSE;

/* Extend the run for as long as the other case values stay consecutive. */

*ocptr = first_other;
expected = first_other + 1;
ch++;

while (ch <= d && _pcre_ucp_othercase(ch) == expected)
  {
  expected++;
  ch++;
  }

*odptr = expected - 1;
*cptr = ch;

return TRUE;
}
#endif  /* SUPPORT_UCP */
1830    
1831    
1832 nigel 93
1833 nigel 77 /*************************************************
1834 nigel 93 * Check if auto-possessifying is possible *
1835     *************************************************/
1836    
1837     /* This function is called for unlimited repeats of certain items, to see
1838     whether the next thing could possibly match the repeated item. If not, it makes
1839     sense to automatically possessify the repeated item.
1840    
1841     Arguments:
1842     op_code the repeated op code
1843     this data for this item, depends on the opcode
1844     utf8 TRUE in UTF-8 mode
1845     utf8_char used for utf8 character bytes, NULL if not relevant
1846     ptr next character in pattern
1847     options options bits
1848     cd contains pointers to tables etc.
1849    
1850     Returns: TRUE if possessifying is wanted
1851     */
1852    
1853     static BOOL
1854     check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
1855     const uschar *ptr, int options, compile_data *cd)
1856     {
1857     int next;
1858    
1859     /* Skip whitespace and comments in extended mode */
1860    
1861     if ((options & PCRE_EXTENDED) != 0)
1862     {
1863     for (;;)
1864     {
1865     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1866     if (*ptr == '#')
1867     {
1868     while (*(++ptr) != 0)
1869     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1870     }
1871     else break;
1872     }
1873     }
1874    
1875     /* If the next item is one that we can handle, get its value. A non-negative
1876     value is a character, a negative value is an escape value. */
1877    
1878     if (*ptr == '\\')
1879     {
1880     int temperrorcode = 0;
1881     next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
1882     if (temperrorcode != 0) return FALSE;
1883     ptr++; /* Point after the escape sequence */
1884     }
1885    
1886     else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
1887     {
1888     #ifdef SUPPORT_UTF8
1889     if (utf8) { GETCHARINC(next, ptr); } else
1890     #endif
1891     next = *ptr++;
1892     }
1893    
1894     else return FALSE;
1895    
1896     /* Skip whitespace and comments in extended mode */
1897    
1898     if ((options & PCRE_EXTENDED) != 0)
1899     {
1900     for (;;)
1901     {
1902     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1903     if (*ptr == '#')
1904     {
1905     while (*(++ptr) != 0)
1906     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1907     }
1908     else break;
1909     }
1910     }
1911    
1912     /* If the next thing is itself optional, we have to give up. */
1913    
1914     if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
1915     return FALSE;
1916    
1917     /* Now compare the next item with the previous opcode. If the previous is a
1918     positive single character match, "item" either contains the character or, if
1919     "item" is greater than 127 in utf8 mode, the character's bytes are in
1920     utf8_char. */
1921    
1922    
1923     /* Handle cases when the next item is a character. */
1924    
1925     if (next >= 0) switch(op_code)
1926     {
1927     case OP_CHAR:
1928     #ifdef SUPPORT_UTF8
1929     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1930     #endif
1931     return item != next;
1932    
1933     /* For CHARNC (caseless character) we must check the other case. If we have
1934     Unicode property support, we can use it to test the other case of
1935     high-valued characters. */
1936    
1937     case OP_CHARNC:
1938     #ifdef SUPPORT_UTF8
1939     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1940     #endif
1941     if (item == next) return FALSE;
1942     #ifdef SUPPORT_UTF8
1943     if (utf8)
1944     {
1945     unsigned int othercase;
1946     if (next < 128) othercase = cd->fcc[next]; else
1947     #ifdef SUPPORT_UCP
1948     othercase = _pcre_ucp_othercase((unsigned int)next);
1949     #else
1950     othercase = NOTACHAR;
1951     #endif
1952     return (unsigned int)item != othercase;
1953     }
1954     else
1955     #endif /* SUPPORT_UTF8 */
1956     return (item != cd->fcc[next]); /* Non-UTF-8 mode */
1957    
1958     /* For OP_NOT, "item" must be a single-byte character. */
1959    
1960     case OP_NOT:
1961     if (next < 0) return FALSE; /* Not a character */
1962     if (item == next) return TRUE;
1963     if ((options & PCRE_CASELESS) == 0) return FALSE;
1964     #ifdef SUPPORT_UTF8
1965     if (utf8)
1966     {
1967     unsigned int othercase;
1968     if (next < 128) othercase = cd->fcc[next]; else
1969     #ifdef SUPPORT_UCP
1970     othercase = _pcre_ucp_othercase(next);
1971     #else
1972     othercase = NOTACHAR;
1973     #endif
1974     return (unsigned int)item == othercase;
1975     }
1976     else
1977     #endif /* SUPPORT_UTF8 */
1978     return (item == cd->fcc[next]); /* Non-UTF-8 mode */
1979    
1980     case OP_DIGIT:
1981     return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
1982    
1983     case OP_NOT_DIGIT:
1984     return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
1985    
1986     case OP_WHITESPACE:
1987     return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
1988    
1989     case OP_NOT_WHITESPACE:
1990     return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
1991    
1992     case OP_WORDCHAR:
1993     return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
1994    
1995     case OP_NOT_WORDCHAR:
1996     return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
1997    
1998 ph10 180 case OP_HSPACE:
1999     case OP_NOT_HSPACE:
2000     switch(next)
2001     {
2002     case 0x09:
2003     case 0x20:
2004     case 0xa0:
2005     case 0x1680:
2006     case 0x180e:
2007     case 0x2000:
2008     case 0x2001:
2009     case 0x2002:
2010     case 0x2003:
2011     case 0x2004:
2012     case 0x2005:
2013     case 0x2006:
2014     case 0x2007:
2015     case 0x2008:
2016     case 0x2009:
2017     case 0x200A:
2018     case 0x202f:
2019     case 0x205f:
2020     case 0x3000:
2021     return op_code != OP_HSPACE;
2022     default:
2023     return op_code == OP_HSPACE;
2024     }
2025    
2026     case OP_VSPACE:
2027     case OP_NOT_VSPACE:
2028     switch(next)
2029     {
2030     case 0x0a:
2031     case 0x0b:
2032     case 0x0c:
2033     case 0x0d:
2034     case 0x85:
2035     case 0x2028:
2036     case 0x2029:
2037     return op_code != OP_VSPACE;
2038     default:
2039     return op_code == OP_VSPACE;
2040     }
2041    
2042 nigel 93 default:
2043     return FALSE;
2044     }
2045    
2046    
2047     /* Handle the case when the next item is \d, \s, etc. */
2048    
2049     switch(op_code)
2050     {
2051     case OP_CHAR:
2052     case OP_CHARNC:
2053     #ifdef SUPPORT_UTF8
2054     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2055     #endif
2056     switch(-next)
2057     {
2058     case ESC_d:
2059     return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2060    
2061     case ESC_D:
2062     return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2063    
2064     case ESC_s:
2065     return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2066    
2067     case ESC_S:
2068     return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2069    
2070     case ESC_w:
2071     return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2072    
2073     case ESC_W:
2074     return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2075 ph10 182
2076 ph10 180 case ESC_h:
2077     case ESC_H:
2078     switch(item)
2079     {
2080     case 0x09:
2081     case 0x20:
2082     case 0xa0:
2083     case 0x1680:
2084     case 0x180e:
2085     case 0x2000:
2086     case 0x2001:
2087     case 0x2002:
2088     case 0x2003:
2089     case 0x2004:
2090     case 0x2005:
2091     case 0x2006:
2092     case 0x2007:
2093     case 0x2008:
2094     case 0x2009:
2095     case 0x200A:
2096     case 0x202f:
2097     case 0x205f:
2098     case 0x3000:
2099     return -next != ESC_h;
2100     default:
2101     return -next == ESC_h;
2102 ph10 182 }
2103    
2104 ph10 180 case ESC_v:
2105     case ESC_V:
2106     switch(item)
2107     {
2108     case 0x0a:
2109     case 0x0b:
2110     case 0x0c:
2111     case 0x0d:
2112     case 0x85:
2113     case 0x2028:
2114     case 0x2029:
2115     return -next != ESC_v;
2116     default:
2117     return -next == ESC_v;
2118 ph10 182 }
2119 nigel 93
2120     default:
2121     return FALSE;
2122     }
2123    
2124     case OP_DIGIT:
2125 ph10 180 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2126     next == -ESC_h || next == -ESC_v;
2127 nigel 93
2128     case OP_NOT_DIGIT:
2129     return next == -ESC_d;
2130    
2131     case OP_WHITESPACE:
2132     return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2133    
2134     case OP_NOT_WHITESPACE:
2135 ph10 180 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2136 nigel 93
2137 ph10 180 case OP_HSPACE:
2138     return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2139    
2140     case OP_NOT_HSPACE:
2141     return next == -ESC_h;
2142 ph10 182
2143 ph10 180 /* Can't have \S in here because VT matches \S (Perl anomaly) */
2144 ph10 182 case OP_VSPACE:
2145 ph10 180 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2146    
2147     case OP_NOT_VSPACE:
2148 ph10 182 return next == -ESC_v;
2149 ph10 180
2150 nigel 93 case OP_WORDCHAR:
2151 ph10 180 return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2152 nigel 93
2153     case OP_NOT_WORDCHAR:
2154     return next == -ESC_w || next == -ESC_d;
2155 ph10 182
2156 nigel 93 default:
2157     return FALSE;
2158     }
2159    
2160     /* Control does not reach here */
2161     }
2162    
2163    
2164    
2165     /*************************************************
2166 nigel 77 * Compile one branch *
2167     *************************************************/
2168    
2169 nigel 93 /* Scan the pattern, compiling it into a vector. If the options are
2170 nigel 77 changed during the branch, the pointer is used to change the external options
2171 nigel 93 bits. This function is used during the pre-compile phase when we are trying
2172     to find out the amount of memory needed, as well as during the real compile
2173     phase. The value of lengthptr distinguishes the two phases.
2174 nigel 77
2175     Arguments:
2176     optionsptr pointer to the option bits
2177     codeptr points to the pointer to the current code point
2178     ptrptr points to the current pattern pointer
2179     errorcodeptr points to error code variable
2180     firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2181     reqbyteptr set to the last literal character required, else < 0
2182     bcptr points to current branch chain
2183     cd contains pointers to tables etc.
2184 nigel 93 lengthptr NULL during the real compile phase
2185     points to length accumulator during pre-compile phase
2186 nigel 77
2187     Returns: TRUE on success
2188     FALSE, with *errorcodeptr set non-zero on error
2189     */
2190    
2191     static BOOL
2192 nigel 93 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2193     int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2194     compile_data *cd, int *lengthptr)
2195 nigel 77 {
2196     int repeat_type, op_type;
2197     int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2198     int bravalue = 0;
2199     int greedy_default, greedy_non_default;
2200     int firstbyte, reqbyte;
2201     int zeroreqbyte, zerofirstbyte;
2202     int req_caseopt, reqvary, tempreqvary;
2203     int options = *optionsptr;
2204     int after_manual_callout = 0;
2205 nigel 93 int length_prevgroup = 0;
2206 nigel 77 register int c;
2207     register uschar *code = *codeptr;
2208 nigel 93 uschar *last_code = code;
2209     uschar *orig_code = code;
2210 nigel 77 uschar *tempcode;
2211     BOOL inescq = FALSE;
2212     BOOL groupsetfirstbyte = FALSE;
2213     const uschar *ptr = *ptrptr;
2214     const uschar *tempptr;
2215     uschar *previous = NULL;
2216     uschar *previous_callout = NULL;
2217 nigel 93 uschar *save_hwm = NULL;
2218 nigel 77 uschar classbits[32];
2219    
2220     #ifdef SUPPORT_UTF8
2221     BOOL class_utf8;
2222     BOOL utf8 = (options & PCRE_UTF8) != 0;
2223     uschar *class_utf8data;
2224     uschar utf8_char[6];
2225     #else
2226     BOOL utf8 = FALSE;
2227 nigel 93 uschar *utf8_char = NULL;
2228 nigel 77 #endif
2229    
2230 nigel 93 #ifdef DEBUG
2231     if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2232     #endif
2233    
2234 nigel 77 /* Set up the default and non-default settings for greediness */
2235    
2236     greedy_default = ((options & PCRE_UNGREEDY) != 0);
2237     greedy_non_default = greedy_default ^ 1;
2238    
2239     /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2240     matching encountered yet". It gets changed to REQ_NONE if we hit something that
2241     matches a non-fixed char first char; reqbyte just remains unset if we never
2242     find one.
2243    
2244     When we hit a repeat whose minimum is zero, we may have to adjust these values
2245     to take the zero repeat into account. This is implemented by setting them to
2246     zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2247     item types that can be repeated set these backoff variables appropriately. */
2248    
2249     firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2250    
2251     /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2252     according to the current setting of the caseless flag. REQ_CASELESS is a bit
2253     value > 255. It is added into the firstbyte or reqbyte variables to record the
2254     case status of the value. This is used only for ASCII characters. */
2255    
2256     req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2257    
2258     /* Switch on next character until the end of the branch */
2259    
2260     for (;; ptr++)
2261     {
2262     BOOL negate_class;
2263     BOOL possessive_quantifier;
2264     BOOL is_quantifier;
2265 nigel 93 BOOL is_recurse;
2266 ph10 180 BOOL reset_bracount;
2267 nigel 77 int class_charcount;
2268     int class_lastchar;
2269     int newoptions;
2270     int recno;
2271 ph10 172 int refsign;
2272 nigel 77 int skipbytes;
2273     int subreqbyte;
2274     int subfirstbyte;
2275 nigel 93 int terminator;
2276 nigel 77 int mclength;
2277     uschar mcbuffer[8];
2278    
2279 nigel 93 /* Get next byte in the pattern */
2280 nigel 77
2281     c = *ptr;
2282    
2283 nigel 93 /* If we are in the pre-compile phase, accumulate the length used for the
2284     previous cycle of this loop. */
2285    
2286     if (lengthptr != NULL)
2287     {
2288     #ifdef DEBUG
2289     if (code > cd->hwm) cd->hwm = code; /* High water info */
2290     #endif
2291     if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2292     {
2293     *errorcodeptr = ERR52;
2294     goto FAILED;
2295     }
2296    
2297     /* There is at least one situation where code goes backwards: this is the
2298     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2299     the class is simply eliminated. However, it is created first, so we have to
2300     allow memory for it. Therefore, don't ever reduce the length at this point.
2301     */
2302    
2303     if (code < last_code) code = last_code;
2304 ph10 202
2305     /* Paranoid check for integer overflow */
2306    
2307     if (OFLOW_MAX - *lengthptr < code - last_code)
2308     {
2309     *errorcodeptr = ERR20;
2310     goto FAILED;
2311     }
2312    
2313 nigel 93 *lengthptr += code - last_code;
2314     DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2315    
2316     /* If "previous" is set and it is not at the start of the work space, move
2317     it back to there, in order to avoid filling up the work space. Otherwise,
2318     if "previous" is NULL, reset the current code pointer to the start. */
2319    
2320     if (previous != NULL)
2321     {
2322     if (previous > orig_code)
2323     {
2324     memmove(orig_code, previous, code - previous);
2325     code -= previous - orig_code;
2326     previous = orig_code;
2327     }
2328     }
2329     else code = orig_code;
2330    
2331     /* Remember where this code item starts so we can pick up the length
2332     next time round. */
2333    
2334     last_code = code;
2335     }
2336    
2337     /* In the real compile phase, just check the workspace used by the forward
2338     reference list. */
2339    
2340     else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2341     {
2342     *errorcodeptr = ERR52;
2343     goto FAILED;
2344     }
2345    
2346 nigel 77 /* If in \Q...\E, check for the end; if not, we have a literal */
2347    
2348     if (inescq && c != 0)
2349     {
2350     if (c == '\\' && ptr[1] == 'E')
2351     {
2352     inescq = FALSE;
2353     ptr++;
2354     continue;
2355     }
2356     else
2357     {
2358     if (previous_callout != NULL)
2359     {
2360 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2361     complete_callout(previous_callout, ptr, cd);
2362 nigel 77 previous_callout = NULL;
2363     }
2364     if ((options & PCRE_AUTO_CALLOUT) != 0)
2365     {
2366     previous_callout = code;
2367     code = auto_callout(code, ptr, cd);
2368     }
2369     goto NORMAL_CHAR;
2370     }
2371     }
2372    
2373     /* Fill in length of a previous callout, except when the next thing is
2374     a quantifier. */
2375    
2376     is_quantifier = c == '*' || c == '+' || c == '?' ||
2377     (c == '{' && is_counted_repeat(ptr+1));
2378    
2379     if (!is_quantifier && previous_callout != NULL &&
2380     after_manual_callout-- <= 0)
2381     {
2382 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2383     complete_callout(previous_callout, ptr, cd);
2384 nigel 77 previous_callout = NULL;
2385     }
2386    
2387     /* In extended mode, skip white space and comments */
2388    
2389     if ((options & PCRE_EXTENDED) != 0)
2390     {
2391     if ((cd->ctypes[c] & ctype_space) != 0) continue;
2392     if (c == '#')
2393     {
2394 nigel 93 while (*(++ptr) != 0)
2395 nigel 91 {
2396 nigel 93 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2397 nigel 91 }
2398 nigel 93 if (*ptr != 0) continue;
2399    
2400 nigel 91 /* Else fall through to handle end of string */
2401     c = 0;
2402 nigel 77 }
2403     }
2404    
2405     /* No auto callout for quantifiers. */
2406    
2407     if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2408     {
2409     previous_callout = code;
2410     code = auto_callout(code, ptr, cd);
2411     }
2412    
2413     switch(c)
2414     {
2415 nigel 93 /* ===================================================================*/
2416     case 0: /* The branch terminates at string end */
2417     case '|': /* or | or ) */
2418 nigel 77 case ')':
2419     *firstbyteptr = firstbyte;
2420     *reqbyteptr = reqbyte;
2421     *codeptr = code;
2422     *ptrptr = ptr;
2423 nigel 93 if (lengthptr != NULL)
2424     {
2425 ph10 202 if (OFLOW_MAX - *lengthptr < code - last_code)
2426     {
2427     *errorcodeptr = ERR20;
2428     goto FAILED;
2429     }
2430 nigel 93 *lengthptr += code - last_code; /* To include callout length */
2431     DPRINTF((">> end branch\n"));
2432     }
2433 nigel 77 return TRUE;
2434    
2435 nigel 93
2436     /* ===================================================================*/
2437 nigel 77 /* Handle single-character metacharacters. In multiline mode, ^ disables
2438     the setting of any following char as a first character. */
2439    
2440     case '^':
2441     if ((options & PCRE_MULTILINE) != 0)
2442     {
2443     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2444     }
2445     previous = NULL;
2446     *code++ = OP_CIRC;
2447     break;
2448    
2449     case '$':
2450     previous = NULL;
2451     *code++ = OP_DOLL;
2452     break;
2453    
2454     /* There can never be a first char if '.' is first, whatever happens about
2455     repeats. The value of reqbyte doesn't change either. */
2456    
2457     case '.':
2458     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2459     zerofirstbyte = firstbyte;
2460     zeroreqbyte = reqbyte;
2461     previous = code;
2462     *code++ = OP_ANY;
2463     break;
2464    
2465 nigel 93
2466     /* ===================================================================*/
2467 nigel 87 /* Character classes. If the included characters are all < 256, we build a
2468     32-byte bitmap of the permitted characters, except in the special case
2469     where there is only one such character. For negated classes, we build the
2470     map as usual, then invert it at the end. However, we use a different opcode
2471     so that data characters > 255 can be handled correctly.
2472 nigel 77
2473     If the class contains characters outside the 0-255 range, a different
2474     opcode is compiled. It may optionally have a bit map for characters < 256,
2475     but those above are explicitly listed afterwards. A flag byte tells
2476     whether the bitmap is present, and whether this is a negated class or not.
2477     */
2478    
2479     case '[':
2480     previous = code;
2481    
2482     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2483     they are encountered at the top level, so we'll do that too. */
2484    
2485     if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2486     check_posix_syntax(ptr, &tempptr, cd))
2487     {
2488     *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2489     goto FAILED;
2490     }
2491    
2492 ph10 205 /* If the first character is '^', set the negation flag and skip it. Also,
2493 ph10 208 if the first few characters (either before or after ^) are \Q\E or \E we
2494 ph10 205 skip them too. This makes for compatibility with Perl. */
2495 ph10 208
2496 ph10 205 negate_class = FALSE;
2497     for (;;)
2498 nigel 77 {
2499     c = *(++ptr);
2500 ph10 205 if (c == '\\')
2501     {
2502 ph10 208 if (ptr[1] == 'E') ptr++;
2503 ph10 205 else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
2504 ph10 208 else break;
2505 ph10 205 }
2506     else if (!negate_class && c == '^')
2507     negate_class = TRUE;
2508     else break;
2509 ph10 208 }
2510 nigel 77
2511     /* Keep a count of chars with values < 256 so that we can optimize the case
2512 nigel 93 of just a single character (as long as it's < 256). However, for higher
2513     valued UTF-8 characters, we don't yet do any optimization. */
2514 nigel 77
2515     class_charcount = 0;
2516     class_lastchar = -1;
2517    
2518 nigel 93 /* Initialize the 32-char bit map to all zeros. We build the map in a
2519     temporary bit of memory, in case the class contains only 1 character (less
2520     than 256), because in that case the compiled code doesn't use the bit map.
2521     */
2522    
2523     memset(classbits, 0, 32 * sizeof(uschar));
2524    
2525 nigel 77 #ifdef SUPPORT_UTF8
2526     class_utf8 = FALSE; /* No chars >= 256 */
2527 nigel 93 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
2528 nigel 77 #endif
2529    
2530     /* Process characters until ] is reached. By writing this as a "do" it
2531 nigel 93 means that an initial ] is taken as a data character. At the start of the
2532     loop, c contains the first byte of the character. */
2533 nigel 77
2534 nigel 93 if (c != 0) do
2535 nigel 77 {
2536 nigel 93 const uschar *oldptr;
2537    
2538 nigel 77 #ifdef SUPPORT_UTF8
2539     if (utf8 && c > 127)
2540     { /* Braces are required because the */
2541     GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2542     }
2543     #endif
2544    
2545     /* Inside \Q...\E everything is literal except \E */
2546    
2547     if (inescq)
2548     {
2549 nigel 93 if (c == '\\' && ptr[1] == 'E') /* If we are at \E */
2550 nigel 77 {
2551 nigel 93 inescq = FALSE; /* Reset literal state */
2552     ptr++; /* Skip the 'E' */
2553     continue; /* Carry on with next */
2554 nigel 77 }
2555 nigel 93 goto CHECK_RANGE; /* Could be range if \E follows */
2556 nigel 77 }
2557    
2558     /* Handle POSIX class names. Perl allows a negation extension of the
2559     form [:^name:]. A square bracket that doesn't match the syntax is
2560     treated as a literal. We also recognize the POSIX constructions
2561     [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2562     5.6 and 5.8 do. */
2563    
2564     if (c == '[' &&
2565     (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2566     check_posix_syntax(ptr, &tempptr, cd))
2567     {
2568     BOOL local_negate = FALSE;
2569 nigel 87 int posix_class, taboffset, tabopt;
2570 nigel 77 register const uschar *cbits = cd->cbits;
2571 nigel 87 uschar pbits[32];
2572 nigel 77
2573     if (ptr[1] != ':')
2574     {
2575     *errorcodeptr = ERR31;
2576     goto FAILED;
2577     }
2578    
2579     ptr += 2;
2580     if (*ptr == '^')
2581     {
2582     local_negate = TRUE;
2583     ptr++;
2584     }
2585    
2586     posix_class = check_posix_name(ptr, tempptr - ptr);
2587     if (posix_class < 0)
2588     {
2589     *errorcodeptr = ERR30;
2590     goto FAILED;
2591     }
2592    
2593     /* If matching is caseless, upper and lower are converted to
2594     alpha. This relies on the fact that the class table starts with
2595     alpha, lower, upper as the first 3 entries. */
2596    
2597     if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2598     posix_class = 0;
2599    
2600 nigel 87 /* We build the bit map for the POSIX class in a chunk of local store
2601     because we may be adding and subtracting from it, and we don't want to
2602     subtract bits that may be in the main map already. At the end we or the
2603     result into the bit map that is being built. */
2604 nigel 77
2605     posix_class *= 3;
2606 nigel 87
2607     /* Copy in the first table (always present) */
2608    
2609     memcpy(pbits, cbits + posix_class_maps[posix_class],
2610     32 * sizeof(uschar));
2611    
2612     /* If there is a second table, add or remove it as required. */
2613    
2614     taboffset = posix_class_maps[posix_class + 1];
2615     tabopt = posix_class_maps[posix_class + 2];
2616    
2617     if (taboffset >= 0)
2618 nigel 77 {
2619 nigel 87 if (tabopt >= 0)
2620     for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
2621 nigel 77 else
2622 nigel 87 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
2623 nigel 77 }
2624    
2625 nigel 87 /* Now see if we need to remove any special characters. An option
2626     value of 1 removes vertical space and 2 removes underscore. */
2627    
2628     if (tabopt < 0) tabopt = -tabopt;
2629     if (tabopt == 1) pbits[1] &= ~0x3c;
2630     else if (tabopt == 2) pbits[11] &= 0x7f;
2631    
2632     /* Add the POSIX table or its complement into the main table that is
2633     being built and we are done. */
2634    
2635     if (local_negate)
2636     for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2637     else
2638     for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2639    
2640 nigel 77 ptr = tempptr + 1;
2641     class_charcount = 10; /* Set > 1; assumes more than 1 per class */
2642     continue; /* End of POSIX syntax handling */
2643     }
2644    
2645     /* Backslash may introduce a single character, or it may introduce one
2646 nigel 93 of the specials, which just set a flag. The sequence \b is a special
2647     case. Inside a class (and only there) it is treated as backspace.
2648     Elsewhere it marks a word boundary. Other escapes have preset maps ready
2649 ph10 205 to 'or' into the one we are building. We assume they have more than one
2650 nigel 77 character in them, so set class_charcount bigger than one. */
2651    
2652     if (c == '\\')
2653     {
2654 nigel 93 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2655     if (*errorcodeptr != 0) goto FAILED;
2656 nigel 77
2657     if (-c == ESC_b) c = '\b'; /* \b is backspace in a class */
2658     else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
2659 nigel 93 else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */
2660 nigel 77 else if (-c == ESC_Q) /* Handle start of quoted string */
2661     {
2662     if (ptr[1] == '\\' && ptr[2] == 'E')
2663     {
2664     ptr += 2; /* avoid empty string */
2665     }
2666     else inescq = TRUE;
2667     continue;
2668     }
2669    
2670     if (c < 0)
2671     {
2672     register const uschar *cbits = cd->cbits;
2673     class_charcount += 2; /* Greater than 1 is what matters */
2674 nigel 93
2675     /* Save time by not doing this in the pre-compile phase. */
2676    
2677     if (lengthptr == NULL) switch (-c)
2678 nigel 77 {
2679     case ESC_d:
2680     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2681     continue;
2682    
2683     case ESC_D:
2684     for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2685     continue;
2686    
2687     case ESC_w:
2688     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2689     continue;
2690    
2691     case ESC_W:
2692     for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2693     continue;
2694    
2695     case ESC_s:
2696     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2697     classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2698     continue;
2699    
2700     case ESC_S:
2701     for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2702     classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2703     continue;
2704    
2705 nigel 93 case ESC_E: /* Perl ignores an orphan \E */
2706     continue;
2707 ph10 180
2708 nigel 93 default: /* Not recognized; fall through */
2709     break; /* Need "default" setting to stop compiler warning. */
2710     }
2711    
2712     /* In the pre-compile phase, just do the recognition. */
2713    
2714     else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2715     c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2716 ph10 180
2717 ph10 178 /* We need to deal with \H, \h, \V, and \v in both phases because
2718     they use extra memory. */
2719 ph10 180
2720 ph10 178 if (-c == ESC_h)
2721     {
2722     SETBIT(classbits, 0x09); /* HT */
2723     SETBIT(classbits, 0x20); /* SPACE */
2724 ph10 180 SETBIT(classbits, 0xa0); /* NBSP */
2725 ph10 178 #ifdef SUPPORT_UTF8
2726     if (utf8)
2727 ph10 180 {
2728 ph10 178 class_utf8 = TRUE;
2729     *class_utf8data++ = XCL_SINGLE;
2730 ph10 180 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2731 ph10 178 *class_utf8data++ = XCL_SINGLE;
2732 ph10 180 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2733     *class_utf8data++ = XCL_RANGE;
2734     class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2735     class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2736 ph10 178 *class_utf8data++ = XCL_SINGLE;
2737 ph10 180 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2738 ph10 178 *class_utf8data++ = XCL_SINGLE;
2739 ph10 180 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2740 ph10 178 *class_utf8data++ = XCL_SINGLE;
2741 ph10 180 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
2742     }
2743     #endif
2744     continue;
2745     }
2746 nigel 93
2747 ph10 178 if (-c == ESC_H)
2748     {
2749     for (c = 0; c < 32; c++)
2750     {
2751     int x = 0xff;
2752     switch (c)
2753 ph10 180 {
2754 ph10 178 case 0x09/8: x ^= 1 << (0x09%8); break;
2755     case 0x20/8: x ^= 1 << (0x20%8); break;
2756     case 0xa0/8: x ^= 1 << (0xa0%8); break;
2757     default: break;
2758     }
2759     classbits[c] |= x;
2760 ph10 180 }
2761    
2762 ph10 178 #ifdef SUPPORT_UTF8
2763     if (utf8)
2764 ph10 180 {
2765 ph10 178 class_utf8 = TRUE;
2766 ph10 180 *class_utf8data++ = XCL_RANGE;
2767     class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2768     class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
2769     *class_utf8data++ = XCL_RANGE;
2770     class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
2771     class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
2772     *class_utf8data++ = XCL_RANGE;
2773     class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
2774     class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
2775     *class_utf8data++ = XCL_RANGE;
2776     class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
2777     class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
2778     *class_utf8data++ = XCL_RANGE;
2779     class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
2780     class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
2781     *class_utf8data++ = XCL_RANGE;
2782     class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
2783     class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
2784     *class_utf8data++ = XCL_RANGE;
2785     class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
2786     class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2787     }
2788     #endif
2789     continue;
2790     }
2791 ph10 178
2792     if (-c == ESC_v)
2793     {
2794     SETBIT(classbits, 0x0a); /* LF */
2795     SETBIT(classbits, 0x0b); /* VT */
2796 ph10 180 SETBIT(classbits, 0x0c); /* FF */
2797     SETBIT(classbits, 0x0d); /* CR */
2798     SETBIT(classbits, 0x85); /* NEL */
2799 ph10 178 #ifdef SUPPORT_UTF8
2800     if (utf8)
2801 ph10 180 {
2802 ph10 178 class_utf8 = TRUE;
2803 ph10 180 *class_utf8data++ = XCL_RANGE;
2804     class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
2805     class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2806     }
2807     #endif
2808     continue;
2809     }
2810 ph10 178
2811     if (-c == ESC_V)
2812     {
2813     for (c = 0; c < 32; c++)
2814     {
2815     int x = 0xff;
2816     switch (c)
2817 ph10 180 {
2818 ph10 178 case 0x0a/8: x ^= 1 << (0x0a%8);
2819     x ^= 1 << (0x0b%8);
2820     x ^= 1 << (0x0c%8);
2821 ph10 180 x ^= 1 << (0x0d%8);
2822 ph10 178 break;
2823     case 0x85/8: x ^= 1 << (0x85%8); break;
2824     default: break;
2825     }
2826     classbits[c] |= x;
2827 ph10 180 }
2828    
2829 ph10 178 #ifdef SUPPORT_UTF8
2830     if (utf8)
2831 ph10 180 {
2832 ph10 178 class_utf8 = TRUE;
2833 ph10 180 *class_utf8data++ = XCL_RANGE;
2834     class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2835     class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
2836     *class_utf8data++ = XCL_RANGE;
2837     class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2838     class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2839     }
2840     #endif
2841     continue;
2842     }
2843 ph10 178
2844 nigel 93 /* We need to deal with \P and \p in both phases. */
2845    
2846 nigel 77 #ifdef SUPPORT_UCP
2847 nigel 93 if (-c == ESC_p || -c == ESC_P)
2848     {
2849     BOOL negated;
2850     int pdata;
2851     int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2852     if (ptype < 0) goto FAILED;
2853     class_utf8 = TRUE;
2854     *class_utf8data++ = ((-c == ESC_p) != negated)?
2855     XCL_PROP : XCL_NOTPROP;
2856     *class_utf8data++ = ptype;
2857     *class_utf8data++ = pdata;
2858     class_charcount -= 2; /* Not a < 256 character */
2859 nigel 77 continue;
2860 nigel 93 }
2861 nigel 77 #endif
2862 nigel 93 /* Unrecognized escapes are faulted if PCRE is running in its
2863     strict mode. By default, for compatibility with Perl, they are
2864     treated as literals. */
2865 nigel 77
2866 nigel 93 if ((options & PCRE_EXTRA) != 0)
2867     {
2868     *errorcodeptr = ERR7;
2869     goto FAILED;
2870     }
2871 nigel 77
2872 nigel 93 class_charcount -= 2; /* Undo the default count from above */
2873     c = *ptr; /* Get the final character and fall through */
2874 nigel 77 }
2875    
2876     /* Fall through if we have a single character (c >= 0). This may be
2877 nigel 93 greater than 256 in UTF-8 mode. */
2878 nigel 77
2879     } /* End of backslash handling */
2880    
2881     /* A single character may be followed by '-' to form a range. However,
2882     Perl does not permit ']' to be the end of the range. A '-' character
2883 nigel 93 at the end is treated as a literal. Perl ignores orphaned \E sequences
2884     entirely. The code for handling \Q and \E is messy. */
2885 nigel 77
2886 nigel 93 CHECK_RANGE:
2887     while (ptr[1] == '\\' && ptr[2] == 'E')
2888 nigel 77 {
2889 nigel 93 inescq = FALSE;
2890     ptr += 2;
2891     }
2892    
2893     oldptr = ptr;
2894    
2895     if (!inescq && ptr[1] == '-')
2896     {
2897 nigel 77 int d;
2898     ptr += 2;
2899 nigel 93 while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
2900 nigel 77
2901 nigel 93 /* If we hit \Q (not followed by \E) at this point, go into escaped
2902     mode. */
2903    
2904     while (*ptr == '\\' && ptr[1] == 'Q')
2905     {
2906     ptr += 2;
2907     if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
2908     inescq = TRUE;
2909     break;
2910     }
2911    
2912     if (*ptr == 0 || (!inescq && *ptr == ']'))
2913     {
2914     ptr = oldptr;
2915     goto LONE_SINGLE_CHARACTER;
2916     }
2917    
2918 nigel 77 #ifdef SUPPORT_UTF8
2919     if (utf8)
2920     { /* Braces are required because the */
2921     GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
2922     }
2923     else
2924     #endif
2925     d = *ptr; /* Not UTF-8 mode */
2926    
2927     /* The second part of a range can be a single-character escape, but
2928     not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2929     in such circumstances. */
2930    
2931 nigel 93 if (!inescq && d == '\\')
2932 nigel 77 {
2933 nigel 93 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2934     if (*errorcodeptr != 0) goto FAILED;
2935 nigel 77
2936 nigel 93 /* \b is backspace; \X is literal X; \R is literal R; any other
2937     special means the '-' was literal */
2938 nigel 77
2939     if (d < 0)
2940     {
2941     if (d == -ESC_b) d = '\b';
2942 nigel 93 else if (d == -ESC_X) d = 'X';
2943     else if (d == -ESC_R) d = 'R'; else
2944 nigel 77 {
2945 nigel 93 ptr = oldptr;
2946 nigel 77 goto LONE_SINGLE_CHARACTER; /* A few lines below */
2947     }
2948     }
2949     }
2950    
2951 nigel 93 /* Check that the two values are in the correct order. Optimize
2952     one-character ranges */
2953 nigel 77
2954 nigel 93 if (d < c)
2955     {
2956     *errorcodeptr = ERR8;
2957     goto FAILED;
2958     }
2959    
2960 nigel 77 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
2961    
2962     /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
2963     matching, we have to use an XCLASS with extra data items. Caseless
2964     matching for characters > 127 is available only if UCP support is
2965     available. */
2966    
2967     #ifdef SUPPORT_UTF8
2968     if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
2969     {
2970     class_utf8 = TRUE;
2971    
2972     /* With UCP support, we can find the other case equivalents of
2973     the relevant characters. There may be several ranges. Optimize how
2974     they fit with the basic range. */
2975    
2976     #ifdef SUPPORT_UCP
2977     if ((options & PCRE_CASELESS) != 0)
2978     {
2979 nigel 93 unsigned int occ, ocd;
2980     unsigned int cc = c;
2981     unsigned int origd = d;
2982 nigel 77 while (get_othercase_range(&cc, origd, &occ, &ocd))
2983     {
2984 ph10 180 if (occ >= (unsigned int)c &&
2985     ocd <= (unsigned int)d)
2986 ph10 176 continue; /* Skip embedded ranges */
2987 nigel 77
2988 ph10 180 if (occ < (unsigned int)c &&
2989 ph10 176 ocd >= (unsigned int)c - 1) /* Extend the basic range */
2990 nigel 77 { /* if there is overlap, */
2991     c = occ; /* noting that if occ < c */
2992     continue; /* we can't have ocd > d */
2993     } /* because a subrange is */
2994 ph10 180 if (ocd > (unsigned int)d &&
2995 ph10 176 occ <= (unsigned int)d + 1) /* always shorter than */
2996 nigel 77 { /* the basic range. */
2997     d = ocd;
2998     continue;
2999     }
3000    
3001     if (occ == ocd)
3002     {
3003     *class_utf8data++ = XCL_SINGLE;
3004     }
3005     else
3006     {
3007     *class_utf8data++ = XCL_RANGE;
3008     class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3009     }
3010     class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3011     }
3012     }
3013     #endif /* SUPPORT_UCP */
3014    
3015     /* Now record the original range, possibly modified for UCP caseless
3016     overlapping ranges. */
3017    
3018     *class_utf8data++ = XCL_RANGE;
3019     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3020     class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3021    
3022     /* With UCP support, we are done. Without UCP support, there is no
3023     caseless matching for UTF-8 characters > 127; we can use the bit map
3024     for the smaller ones. */
3025    
3026     #ifdef SUPPORT_UCP
3027     continue; /* With next character in the class */
3028     #else
3029     if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3030    
3031     /* Adjust upper limit and fall through to set up the map */
3032    
3033     d = 127;
3034    
3035     #endif /* SUPPORT_UCP */
3036     }
3037     #endif /* SUPPORT_UTF8 */
3038    
3039     /* We use the bit map for all cases when not in UTF-8 mode; else
3040     ranges that lie entirely within 0-127 when there is UCP support; else
3041     for partial ranges without UCP support. */
3042    
3043 nigel 93 class_charcount += d - c + 1;
3044     class_lastchar = d;
3045    
3046     /* We can save a bit of time by skipping this in the pre-compile. */
3047    
3048     if (lengthptr == NULL) for (; c <= d; c++)
3049 nigel 77 {
3050     classbits[c/8] |= (1 << (c&7));
3051     if ((options & PCRE_CASELESS) != 0)
3052     {
3053     int uc = cd->fcc[c]; /* flip case */
3054     classbits[uc/8] |= (1 << (uc&7));
3055     }
3056     }
3057    
3058     continue; /* Go get the next char in the class */
3059     }
3060    
3061     /* Handle a lone single character - we can get here for a normal
3062     non-escape char, or after \ that introduces a single character or for an
3063     apparent range that isn't. */
3064    
3065     LONE_SINGLE_CHARACTER:
3066    
3067     /* Handle a character that cannot go in the bit map */
3068    
3069     #ifdef SUPPORT_UTF8
3070     if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3071     {
3072     class_utf8 = TRUE;
3073     *class_utf8data++ = XCL_SINGLE;
3074     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3075    
3076     #ifdef SUPPORT_UCP
3077     if ((options & PCRE_CASELESS) != 0)
3078     {
3079 nigel 93 unsigned int othercase;
3080     if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
3081 nigel 77 {
3082     *class_utf8data++ = XCL_SINGLE;
3083     class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3084     }
3085     }
3086     #endif /* SUPPORT_UCP */
3087    
3088     }
3089     else
3090     #endif /* SUPPORT_UTF8 */
3091    
3092     /* Handle a single-byte character */
3093     {
3094     classbits[c/8] |= (1 << (c&7));
3095     if ((options & PCRE_CASELESS) != 0)
3096     {
3097     c = cd->fcc[c]; /* flip case */
3098     classbits[c/8] |= (1 << (c&7));
3099     }
3100     class_charcount++;
3101     class_lastchar = c;
3102     }
3103     }
3104    
3105 nigel 93 /* Loop until ']' reached. This "while" is the end of the "do" above. */
3106 nigel 77
3107 nigel 93 while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
3108 nigel 77
3109 nigel 93 if (c == 0) /* Missing terminating ']' */
3110     {
3111     *errorcodeptr = ERR6;
3112     goto FAILED;
3113     }
3114 ph10 208
3115 nigel 77 /* If class_charcount is 1, we saw precisely one character whose value is
3116     less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
3117     can optimize the negative case only if there were no characters >= 128
3118     because OP_NOT and the related opcodes like OP_NOTSTAR operate on
3119     single-bytes only. This is an historical hangover. Maybe one day we can
3120     tidy these opcodes to handle multi-byte characters.
3121    
3122     The optimization throws away the bit map. We turn the item into a
3123     1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3124     that OP_NOT does not support multibyte characters. In the positive case, it
3125     can cause firstbyte to be set. Otherwise, there can be no first char if
3126     this item is first, whatever repeat count may follow. In the case of
3127     reqbyte, save the previous value for reinstating. */
3128    
3129     #ifdef SUPPORT_UTF8
3130     if (class_charcount == 1 &&
3131     (!utf8 ||
3132     (!class_utf8 && (!negate_class || class_lastchar < 128))))
3133    
3134     #else
3135     if (class_charcount == 1)
3136     #endif
3137     {
3138     zeroreqbyte = reqbyte;
3139    
3140     /* The OP_NOT opcode works on one-byte characters only. */
3141    
3142     if (negate_class)
3143     {
3144     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3145     zerofirstbyte = firstbyte;
3146     *code++ = OP_NOT;
3147     *code++ = class_lastchar;
3148     break;
3149     }
3150    
3151     /* For a single, positive character, get the value into mcbuffer, and
3152     then we can handle this with the normal one-character code. */
3153    
3154     #ifdef SUPPORT_UTF8
3155     if (utf8 && class_lastchar > 127)
3156     mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3157     else
3158     #endif
3159     {
3160     mcbuffer[0] = class_lastchar;
3161     mclength = 1;
3162     }
3163     goto ONE_CHAR;
3164     } /* End of 1-char optimization */
3165    
3166     /* The general case - not the one-char optimization. If this is the first
3167     thing in the branch, there can be no first char setting, whatever the
3168     repeat count. Any reqbyte setting must remain unchanged after any kind of
3169     repeat. */
3170    
3171     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3172     zerofirstbyte = firstbyte;
3173     zeroreqbyte = reqbyte;
3174    
3175     /* If there are characters with values > 255, we have to compile an
3176     extended class, with its own opcode. If there are no characters < 256,
3177 nigel 93 we can omit the bitmap in the actual compiled code. */
3178 nigel 77
3179     #ifdef SUPPORT_UTF8
3180     if (class_utf8)
3181     {
3182     *class_utf8data++ = XCL_END; /* Marks the end of extra data */
3183     *code++ = OP_XCLASS;
3184     code += LINK_SIZE;
3185     *code = negate_class? XCL_NOT : 0;
3186    
3187 nigel 93 /* If the map is required, move up the extra data to make room for it;
3188     otherwise just move the code pointer to the end of the extra data. */
3189 nigel 77
3190     if (class_charcount > 0)
3191     {
3192     *code++ |= XCL_MAP;
3193 nigel 93 memmove(code + 32, code, class_utf8data - code);
3194 nigel 77 memcpy(code, classbits, 32);
3195 nigel 93 code = class_utf8data + 32;
3196 nigel 77 }
3197 nigel 93 else code = class_utf8data;
3198 nigel 77
3199     /* Now fill in the complete length of the item */
3200    
3201     PUT(previous, 1, code - previous);
3202     break; /* End of class handling */
3203     }
3204     #endif
3205    
3206     /* If there are no characters > 255, negate the 32-byte map if necessary,
3207     and copy it into the code vector. If this is the first thing in the branch,
3208     there can be no first char setting, whatever the repeat count. Any reqbyte
3209     setting must remain unchanged after any kind of repeat. */
3210    
3211     if (negate_class)
3212     {
3213     *code++ = OP_NCLASS;
3214 nigel 93 if (lengthptr == NULL) /* Save time in the pre-compile phase */
3215     for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3216 nigel 77 }
3217     else
3218     {
3219     *code++ = OP_CLASS;
3220     memcpy(code, classbits, 32);
3221     }
3222     code += 32;
3223     break;
3224    
3225 nigel 93
3226     /* ===================================================================*/
3227 nigel 77 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3228     has been tested above. */
3229    
3230     case '{':
3231     if (!is_quantifier) goto NORMAL_CHAR;
3232     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3233     if (*errorcodeptr != 0) goto FAILED;
3234     goto REPEAT;
3235    
3236     case '*':
3237     repeat_min = 0;
3238     repeat_max = -1;
3239     goto REPEAT;
3240    
3241     case '+':
3242     repeat_min = 1;
3243     repeat_max = -1;
3244     goto REPEAT;
3245    
3246     case '?':
3247     repeat_min = 0;
3248     repeat_max = 1;
3249    
3250     REPEAT:
3251     if (previous == NULL)
3252     {
3253     *errorcodeptr = ERR9;
3254     goto FAILED;
3255     }
3256    
3257     if (repeat_min == 0)
3258     {
3259     firstbyte = zerofirstbyte; /* Adjust for zero repeat */
3260     reqbyte = zeroreqbyte; /* Ditto */
3261     }
3262    
3263     /* Remember whether this is a variable length repeat */
3264    
3265     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3266    
3267     op_type = 0; /* Default single-char op codes */
3268     possessive_quantifier = FALSE; /* Default not possessive quantifier */
3269    
3270     /* Save start of previous item, in case we have to move it up to make space
3271     for an inserted OP_ONCE for the additional '+' extension. */
3272    
3273     tempcode = previous;
3274    
3275     /* If the next character is '+', we have a possessive quantifier. This
3276     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3277     If the next character is '?' this is a minimizing repeat, by default,
3278     but if PCRE_UNGREEDY is set, it works the other way round. We change the
3279     repeat type to the non-default. */
3280    
3281     if (ptr[1] == '+')
3282     {
3283     repeat_type = 0; /* Force greedy */
3284     possessive_quantifier = TRUE;
3285     ptr++;
3286     }
3287     else if (ptr[1] == '?')
3288     {
3289     repeat_type = greedy_non_default;
3290     ptr++;
3291     }
3292     else repeat_type = greedy_default;
3293    
3294     /* If previous was a character match, abolish the item and generate a
3295     repeat item instead. If a char item has a minimum of more than one, ensure
3296     that it is set in reqbyte - it might not be if a sequence such as x{3} is
3297     the first thing in a branch because the x will have gone into firstbyte
3298     instead. */
3299    
3300     if (*previous == OP_CHAR || *previous == OP_CHARNC)
3301     {
3302     /* Deal with UTF-8 characters that take up more than one byte. It's
3303     easier to write this out separately than try to macrify it. Use c to
3304     hold the length of the character in bytes, plus 0x80 to flag that it's a
3305     length rather than a small character. */
3306    
3307     #ifdef SUPPORT_UTF8
3308     if (utf8 && (code[-1] & 0x80) != 0)
3309     {
3310     uschar *lastchar = code - 1;
3311     while((*lastchar & 0xc0) == 0x80) lastchar--;
3312     c = code - lastchar; /* Length of UTF-8 character */
3313     memcpy(utf8_char, lastchar, c); /* Save the char */
3314     c |= 0x80; /* Flag c as a length */
3315     }
3316     else
3317     #endif
3318    
3319     /* Handle the case of a single byte - either with no UTF8 support, or
3320     with UTF-8 disabled, or for a UTF-8 character < 128. */
3321    
3322     {
3323     c = code[-1];
3324     if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3325     }
3326    
3327 nigel 93 /* If the repetition is unlimited, it pays to see if the next thing on
3328     the line is something that cannot possibly match this character. If so,
3329     automatically possessifying this item gains some performance in the case
3330     where the match fails. */
3331    
3332     if (!possessive_quantifier &&
3333     repeat_max < 0 &&
3334     check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3335     options, cd))
3336     {
3337     repeat_type = 0; /* Force greedy */
3338     possessive_quantifier = TRUE;
3339     }
3340    
3341 nigel 77 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3342     }
3343    
3344     /* If previous was a single negated character ([^a] or similar), we use
3345     one of the special opcodes, replacing it. The code is shared with single-
3346     character repeats by setting op_type to add a suitable offset into
3347 nigel 93 repeat_type. We can also test for auto-possessification. OP_NOT is
3348     currently used only for single-byte chars. */
3349 nigel 77
3350     else if (*previous == OP_NOT)
3351     {
3352     op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3353     c = previous[1];
3354 nigel 93 if (!possessive_quantifier &&
3355     repeat_max < 0 &&
3356     check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3357     {
3358     repeat_type = 0; /* Force greedy */
3359     possessive_quantifier = TRUE;
3360     }
3361 nigel 77 goto OUTPUT_SINGLE_REPEAT;
3362     }
3363    
3364     /* If previous was a character type match (\d or similar), abolish it and
3365     create a suitable repeat item. The code is shared with single-character
3366     repeats by setting op_type to add a suitable offset into repeat_type. Note
3367     that the Unicode property types will be present only when SUPPORT_UCP is
3368     defined, but we don't wrap the little bits of code here because it just
3369     makes it horribly messy. */
3370    
3371     else if (*previous < OP_EODN)
3372     {
3373     uschar *oldcode;
3374 nigel 87 int prop_type, prop_value;
3375 nigel 77 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3376     c = *previous;
3377    
3378 nigel 93 if (!possessive_quantifier &&
3379     repeat_max < 0 &&
3380     check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3381     {
3382     repeat_type = 0; /* Force greedy */
3383     possessive_quantifier = TRUE;
3384     }
3385    
3386 nigel 77 OUTPUT_SINGLE_REPEAT:
3387 nigel 87 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3388     {
3389     prop_type = previous[1];
3390     prop_value = previous[2];
3391     }
3392     else prop_type = prop_value = -1;
3393 nigel 77
3394     oldcode = code;
3395     code = previous; /* Usually overwrite previous item */
3396    
3397     /* If the maximum is zero then the minimum must also be zero; Perl allows
3398     this case, so we do too - by simply omitting the item altogether. */
3399    
3400     if (repeat_max == 0) goto END_REPEAT;
3401    
3402     /* All real repeats make it impossible to handle partial matching (maybe
3403     one day we will be able to remove this restriction). */
3404    
3405     if (repeat_max != 1) cd->nopartial = TRUE;
3406    
3407     /* Combine the op_type with the repeat_type */
3408    
3409     repeat_type += op_type;
3410    
3411     /* A minimum of zero is handled either as the special case * or ?, or as
3412     an UPTO, with the maximum given. */
3413    
3414     if (repeat_min == 0)
3415     {
3416     if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3417     else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3418     else
3419     {
3420     *code++ = OP_UPTO + repeat_type;
3421     PUT2INC(code, 0, repeat_max);
3422     }
3423     }
3424    
3425     /* A repeat minimum of 1 is optimized into some special cases. If the
3426 nigel 93 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3427 nigel 77 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3428     one less than the maximum. */
3429    
3430     else if (repeat_min == 1)
3431     {
3432     if (repeat_max == -1)
3433     *code++ = OP_PLUS + repeat_type;
3434     else
3435     {
3436     code = oldcode; /* leave previous item in place */
3437     if (repeat_max == 1) goto END_REPEAT;
3438     *code++ = OP_UPTO + repeat_type;
3439     PUT2INC(code, 0, repeat_max - 1);
3440     }
3441     }
3442    
3443     /* The case {n,n} is just an EXACT, while the general case {n,m} is
3444     handled as an EXACT followed by an UPTO. */
3445    
3446     else
3447     {
3448     *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3449     PUT2INC(code, 0, repeat_min);
3450    
3451     /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3452     we have to insert the character for the previous code. For a repeated
3453 nigel 87 Unicode property match, there are two extra bytes that define the
3454 nigel 77 required property. In UTF-8 mode, long characters have their length in
3455     c, with the 0x80 bit as a flag. */
3456    
3457     if (repeat_max < 0)
3458     {
3459     #ifdef SUPPORT_UTF8
3460     if (utf8 && c >= 128)
3461     {
3462     memcpy(code, utf8_char, c & 7);
3463     code += c & 7;
3464     }
3465     else
3466     #endif
3467     {
3468     *code++ = c;
3469 nigel 87 if (prop_type >= 0)
3470     {
3471     *code++ = prop_type;
3472     *code++ = prop_value;
3473     }
3474 nigel 77 }
3475     *code++ = OP_STAR + repeat_type;
3476     }
3477    
3478     /* Else insert an UPTO if the max is greater than the min, again
3479 nigel 93 preceded by the character, for the previously inserted code. If the
3480     UPTO is just for 1 instance, we can use QUERY instead. */
3481 nigel 77
3482     else if (repeat_max != repeat_min)
3483     {
3484     #ifdef SUPPORT_UTF8
3485     if (utf8 && c >= 128)
3486     {
3487     memcpy(code, utf8_char, c & 7);
3488     code += c & 7;
3489     }
3490     else
3491     #endif
3492     *code++ = c;
3493 nigel 87 if (prop_type >= 0)
3494     {
3495     *code++ = prop_type;
3496     *code++ = prop_value;
3497     }
3498 nigel 77 repeat_max -= repeat_min;
3499 nigel 93
3500     if (repeat_max == 1)
3501     {
3502     *code++ = OP_QUERY + repeat_type;
3503     }
3504     else
3505     {
3506     *code++ = OP_UPTO + repeat_type;
3507     PUT2INC(code, 0, repeat_max);
3508     }
3509 nigel 77 }
3510     }
3511    
3512     /* The character or character type itself comes last in all cases. */
3513    
3514     #ifdef SUPPORT_UTF8
3515     if (utf8 && c >= 128)
3516     {
3517     memcpy(code, utf8_char, c & 7);
3518     code += c & 7;
3519     }
3520     else
3521     #endif
3522     *code++ = c;
3523    
3524 nigel 87 /* For a repeated Unicode property match, there are two extra bytes that
3525     define the required property. */
3526 nigel 77
3527     #ifdef SUPPORT_UCP
3528 nigel 87 if (prop_type >= 0)
3529     {
3530     *code++ = prop_type;
3531     *code++ = prop_value;
3532     }
3533 nigel 77 #endif
3534     }
3535    
3536     /* If previous was a character class or a back reference, we put the repeat
3537     stuff after it, but just skip the item if the repeat was {0,0}. */
3538    
3539     else if (*previous == OP_CLASS ||
3540     *previous == OP_NCLASS ||
3541     #ifdef SUPPORT_UTF8
3542     *previous == OP_XCLASS ||
3543     #endif
3544     *previous == OP_REF)
3545     {
3546     if (repeat_max == 0)
3547     {
3548     code = previous;
3549     goto END_REPEAT;
3550     }
3551    
3552     /* All real repeats make it impossible to handle partial matching (maybe
3553     one day we will be able to remove this restriction). */
3554    
3555     if (repeat_max != 1) cd->nopartial = TRUE;
3556    
3557     if (repeat_min == 0 && repeat_max == -1)
3558     *code++ = OP_CRSTAR + repeat_type;
3559     else if (repeat_min == 1 && repeat_max == -1)
3560     *code++ = OP_CRPLUS + repeat_type;
3561     else if (repeat_min == 0 && repeat_max == 1)
3562     *code++ = OP_CRQUERY + repeat_type;
3563     else
3564     {
3565     *code++ = OP_CRRANGE + repeat_type;
3566     PUT2INC(code, 0, repeat_min);
3567     if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
3568     PUT2INC(code, 0, repeat_max);
3569     }
3570     }
3571    
3572     /* If previous was a bracket group, we may have to replicate it in certain
3573     cases. */
3574    
3575 nigel 93 else if (*previous == OP_BRA || *previous == OP_CBRA ||
3576     *previous == OP_ONCE || *previous == OP_COND)
3577 nigel 77 {
3578     register int i;
3579     int ketoffset = 0;
3580     int len = code - previous;
3581     uschar *bralink = NULL;
3582    
3583 nigel 93 /* Repeating a DEFINE group is pointless */
3584    
3585     if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3586     {
3587     *errorcodeptr = ERR55;
3588     goto FAILED;
3589     }
3590    
3591 nigel 77 /* If the maximum repeat count is unlimited, find the end of the bracket
3592     by scanning through from the start, and compute the offset back to it
3593     from the current code pointer. There may be an OP_OPT setting following
3594     the final KET, so we can't find the end just by going back from the code
3595     pointer. */
3596    
3597     if (repeat_max == -1)
3598     {
3599     register uschar *ket = previous;
3600     do ket += GET(ket, 1); while (*ket != OP_KET);
3601     ketoffset = code - ket;
3602     }
3603    
3604     /* The case of a zero minimum is special because of the need to stick
3605     OP_BRAZERO in front of it, and because the group appears once in the
3606     data, whereas in other cases it appears the minimum number of times. For
3607     this reason, it is simplest to treat this case separately, as otherwise
3608     the code gets far too messy. There are several special subcases when the
3609     minimum is zero. */
3610    
3611     if (repeat_min == 0)
3612     {
3613     /* If the maximum is also zero, we just omit the group from the output
3614     altogether. */
3615    
3616     if (repeat_max == 0)
3617     {
3618     code = previous;
3619     goto END_REPEAT;
3620     }
3621    
3622     /* If the maximum is 1 or unlimited, we just have to stick in the
3623     BRAZERO and do no more at this point. However, we do need to adjust
3624     any OP_RECURSE calls inside the group that refer to the group itself or
3625 nigel 93 any internal or forward referenced group, because the offset is from
3626     the start of the whole regex. Temporarily terminate the pattern while
3627     doing this. */
3628 nigel 77
3629     if (repeat_max <= 1)
3630     {
3631     *code = OP_END;
3632 nigel 93 adjust_recurse(previous, 1, utf8, cd, save_hwm);
3633 nigel 77 memmove(previous+1, previous, len);
3634     code++;
3635     *previous++ = OP_BRAZERO + repeat_type;
3636     }
3637    
3638     /* If the maximum is greater than 1 and limited, we have to replicate
3639     in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3640     The first one has to be handled carefully because it's the original
3641     copy, which has to be moved up. The remainder can be handled by code
3642     that is common with the non-zero minimum case below. We have to
3643     adjust the value of repeat_max, since one less copy is required. Once
3644     again, we may have to adjust any OP_RECURSE calls inside the group. */
3645    
3646     else
3647     {
3648     int offset;
3649     *code = OP_END;
3650 nigel 93 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3651 nigel 77 memmove(previous + 2 + LINK_SIZE, previous, len);
3652     code += 2 + LINK_SIZE;
3653     *previous++ = OP_BRAZERO + repeat_type;
3654     *previous++ = OP_BRA;
3655    
3656     /* We chain together the bracket offset fields that have to be
3657     filled in later when the ends of the brackets are reached. */
3658    
3659     offset = (bralink == NULL)? 0 : previous - bralink;
3660     bralink = previous;
3661     PUTINC(previous, 0, offset);
3662     }
3663    
3664     repeat_max--;
3665     }
3666    
3667     /* If the minimum is greater than zero, replicate the group as many
3668     times as necessary, and adjust the maximum to the number of subsequent
3669     copies that we need. If we set a first char from the group, and didn't
3670 nigel 93 set a required char, copy the latter from the former. If there are any
3671     forward reference subroutine calls in the group, there will be entries on
3672     the workspace list; replicate these with an appropriate increment. */
3673 nigel 77
3674     else
3675     {
3676     if (repeat_min > 1)
3677     {
3678 nigel 93 /* In the pre-compile phase, we don't actually do the replication. We
3679 ph10 202 just adjust the length as if we had. Do some paranoid checks for
3680     potential integer overflow. */
3681 nigel 93
3682     if (lengthptr != NULL)
3683 ph10 202 {
3684     int delta = (repeat_min - 1)*length_prevgroup;
3685     if ((double)(repeat_min - 1)*(double)length_prevgroup >
3686     (double)INT_MAX ||
3687     OFLOW_MAX - *lengthptr < delta)
3688     {
3689     *errorcodeptr = ERR20;
3690     goto FAILED;
3691     }
3692     *lengthptr += delta;
3693     }
3694 nigel 93
3695     /* This is compiling for real */
3696    
3697     else
3698 nigel 77 {
3699 nigel 93 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3700     for (i = 1; i < repeat_min; i++)
3701     {
3702     uschar *hc;
3703     uschar *this_hwm = cd->hwm;
3704     memcpy(code, previous, len);
3705     for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3706     {
3707     PUT(cd->hwm, 0, GET(hc, 0) + len);
3708     cd->hwm += LINK_SIZE;
3709     }
3710     save_hwm = this_hwm;
3711     code += len;
3712     }
3713 nigel 77 }
3714     }
3715 nigel 93
3716 nigel 77 if (repeat_max > 0) repeat_max -= repeat_min;
3717     }
3718    
3719     /* This code is common to both the zero and non-zero minimum cases. If
3720     the maximum is limited, it replicates the group in a nested fashion,
3721     remembering the bracket starts on a stack. In the case of a zero minimum,
3722     the first one was set up above. In all cases the repeat_max now specifies
3723 nigel 93 the number of additional copies needed. Again, we must remember to
3724     replicate entries on the forward reference list. */
3725 nigel 77
3726     if (repeat_max >= 0)
3727     {
3728 nigel 93 /* In the pre-compile phase, we don't actually do the replication. We
3729     just adjust the length as if we had. For each repetition we must add 1
3730     to the length for BRAZERO and for all but the last repetition we must
3731 ph10 202 add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
3732     paranoid checks to avoid integer overflow. */
3733 nigel 93
3734     if (lengthptr != NULL && repeat_max > 0)
3735 ph10 202 {
3736     int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3737     2 - 2*LINK_SIZE; /* Last one doesn't nest */
3738     if ((double)repeat_max *
3739     (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
3740     > (double)INT_MAX ||
3741     OFLOW_MAX - *lengthptr < delta)
3742     {
3743     *errorcodeptr = ERR20;
3744     goto FAILED;
3745     }
3746     *lengthptr += delta;
3747     }
3748 nigel 93
3749     /* This is compiling for real */
3750    
3751     else for (i = repeat_max - 1; i >= 0; i--)
3752 nigel 77 {
3753 nigel 93 uschar *hc;
3754     uschar *this_hwm = cd->hwm;
3755    
3756 nigel 77 *code++ = OP_BRAZERO + repeat_type;
3757    
3758     /* All but the final copy start a new nesting, maintaining the
3759     chain of brackets outstanding. */
3760    
3761     if (i != 0)
3762     {
3763     int offset;
3764     *code++ = OP_BRA;
3765     offset = (bralink == NULL)? 0 : code - bralink;
3766     bralink = code;
3767     PUTINC(code, 0, offset);
3768     }
3769    
3770     memcpy(code, previous, len);
3771 nigel 93 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3772     {
3773     PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3774     cd->hwm += LINK_SIZE;
3775     }
3776     save_hwm = this_hwm;
3777 nigel 77 code += len;
3778     }
3779    
3780     /* Now chain through the pending brackets, and fill in their length
3781     fields (which are holding the chain links pro tem). */
3782    
3783     while (bralink != NULL)
3784     {
3785     int oldlinkoffset;
3786     int offset = code - bralink + 1;
3787     uschar *bra = code - offset;
3788     oldlinkoffset = GET(bra, 1);
3789     bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
3790     *code++ = OP_KET;
3791     PUTINC(code, 0, offset);
3792     PUT(bra, 1, offset);
3793     }
3794     }
3795    
3796     /* If the maximum is unlimited, set a repeater in the final copy. We
3797     can't just offset backwards from the current code point, because we
3798     don't know if there's been an options resetting after the ket. The
3799 nigel 93 correct offset was computed above.
3800 nigel 77
3801 nigel 93 Then, when we are doing the actual compile phase, check to see whether
3802     this group is a non-atomic one that could match an empty string. If so,
3803     convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
3804     that runtime checking can be done. [This check is also applied to
3805     atomic groups at runtime, but in a different way.] */
3806    
3807     else
3808     {
3809     uschar *ketcode = code - ketoffset;
3810     uschar *bracode = ketcode - GET(ketcode, 1);
3811     *ketcode = OP_KETRMAX + repeat_type;
3812     if (lengthptr == NULL && *bracode != OP_ONCE)
3813     {
3814     uschar *scode = bracode;
3815     do
3816     {
3817     if (could_be_empty_branch(scode, ketcode, utf8))
3818     {
3819     *bracode += OP_SBRA - OP_BRA;
3820     break;
3821     }
3822     scode += GET(scode, 1);
3823     }
3824     while (*scode == OP_ALT);
3825     }
3826     }
3827 nigel 77 }
3828    
3829     /* Else there's some kind of shambles */
3830    
3831     else
3832     {
3833     *errorcodeptr = ERR11;
3834     goto FAILED;
3835     }
3836    
3837 nigel 93 /* If the character following a repeat is '+', or if certain optimization
3838     tests above succeeded, possessive_quantifier is TRUE. For some of the
3839     simpler opcodes, there is a special alternative opcode for this. For
3840     anything else, we wrap the entire repeated item inside OP_ONCE brackets.
3841     The '+' notation is just syntactic sugar, taken from Sun's Java package,
3842     but the special opcodes can optimize it a bit. The repeated item starts at
3843     tempcode, not at previous, which might be the first part of a string whose
3844     (former) last char we repeated.
3845 nigel 77
3846 nigel 93 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
3847     an 'upto' may follow. We skip over an 'exact' item, and then test the
3848     length of what remains before proceeding. */
3849    
3850 nigel 77 if (possessive_quantifier)
3851     {
3852 nigel 93 int len;
3853     if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
3854     *tempcode == OP_NOTEXACT)
3855     tempcode += _pcre_OP_lengths[*tempcode];
3856     len = code - tempcode;
3857     if (len > 0) switch (*tempcode)
3858     {
3859     case OP_STAR: *tempcode = OP_POSSTAR; break;
3860     case OP_PLUS: *tempcode = OP_POSPLUS; break;
3861     case OP_QUERY: *tempcode = OP_POSQUERY; break;
3862     case OP_UPTO: *tempcode = OP_POSUPTO; break;
3863    
3864     case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
3865     case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
3866     case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
3867     case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
3868    
3869     case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
3870     case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
3871     case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
3872     case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
3873    
3874     default:
3875     memmove(tempcode + 1+LINK_SIZE, tempcode, len);
3876     code += 1 + LINK_SIZE;
3877     len += 1 + LINK_SIZE;
3878     tempcode[0] = OP_ONCE;
3879     *code++ = OP_KET;
3880     PUTINC(code, 0, len);
3881     PUT(tempcode, 1, len);
3882     break;
3883     }
3884 nigel 77 }
3885    
3886     /* In all cases we no longer have a previous item. We also set the
3887     "follows varying string" flag for subsequently encountered reqbytes if
3888     it isn't already set and we have just passed a varying length item. */
3889    
3890     END_REPEAT:
3891     previous = NULL;
3892     cd->req_varyopt |= reqvary;
3893     break;
3894    
3895    
3896 nigel 93 /* ===================================================================*/
3897     /* Start of nested parenthesized sub-expression, or comment or lookahead or
3898     lookbehind or option setting or condition or all the other extended
3899 ph10 210 parenthesis forms. */
3900 nigel 77
3901     case '(':
3902     newoptions = options;
3903     skipbytes = 0;
3904 nigel 93 bravalue = OP_CBRA;
3905     save_hwm = cd->hwm;
3906 ph10 180 reset_bracount = FALSE;
3907 ph10 211
3908 ph10 210 /* First deal with various "verbs" that can be introduced by '*'. */
3909 ph10 211
3910 ph10 210 if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
3911     {
3912 ph10 211 int i, namelen;
3913 ph10 210 const uschar *name = ++ptr;
3914     previous = NULL;
3915     while ((cd->ctypes[*++ptr] & ctype_letter) != 0);
3916     if (*ptr == ':')
3917     {
3918     *errorcodeptr = ERR59; /* Not supported */
3919 ph10 211 goto FAILED;
3920     }
3921 ph10 210 if (*ptr != ')')
3922     {
3923     *errorcodeptr = ERR60;
3924     goto FAILED;
3925     }
3926 ph10 211 namelen = ptr - name;
3927 ph10 210 for (i = 0; i < verbcount; i++)
3928 ph10 211 {
3929 ph10 210 if (namelen == verbs[i].len &&
3930     strncmp((char *)name, verbs[i].name, namelen) == 0)
3931     {
3932     *code = verbs[i].op;
3933     if (*code++ == OP_ACCEPT) cd->had_accept = TRUE;
3934     break;
3935 ph10 211 }
3936     }
3937     if (i < verbcount) continue;
3938 ph10 210 *errorcodeptr = ERR60;
3939 ph10 211 goto FAILED;
3940     }
3941    
3942 ph10 210 /* Deal with the extended parentheses; all are introduced by '?', and the
3943     appearance of any of them means that this is not a capturing group. */
3944 nigel 77
3945 ph10 210 else if (*ptr == '?')
3946 nigel 77 {
3947 nigel 93 int i, set, unset, namelen;
3948 nigel 77 int *optset;
3949 nigel 93 const uschar *name;
3950     uschar *slot;
3951 nigel 77
3952     switch (*(++ptr))
3953     {
3954     case '#': /* Comment; skip to ket */
3955     ptr++;
3956 nigel 93 while (*ptr != 0 && *ptr != ')') ptr++;
3957     if (*ptr == 0)
3958     {
3959     *errorcodeptr = ERR18;
3960     goto FAILED;
3961     }
3962 nigel 77 continue;
3963    
3964 nigel 93
3965     /* ------------------------------------------------------------ */
3966 ph10 175 case '|': /* Reset capture count for each branch */
3967     reset_bracount = TRUE;
3968 ph10 180 /* Fall through */
3969 ph10 175
3970     /* ------------------------------------------------------------ */
3971 nigel 93 case ':': /* Non-capturing bracket */
3972 nigel 77 bravalue = OP_BRA;
3973     ptr++;
3974     break;
3975    
3976 nigel 93
3977     /* ------------------------------------------------------------ */
3978 nigel 77 case '(':
3979     bravalue = OP_COND; /* Conditional group */
3980    
3981 nigel 93 /* A condition can be an assertion, a number (referring to a numbered
3982     group), a name (referring to a named group), or 'R', referring to
3983     recursion. R<digits> and R&name are also permitted for recursion tests.
3984 nigel 77
3985 nigel 93 There are several syntaxes for testing a named group: (?(name)) is used
3986     by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
3987    
3988     There are two unfortunate ambiguities, caused by history. (a) 'R' can
3989     be the recursive thing or the name 'R' (and similarly for 'R' followed
3990     by digits), and (b) a number could be a name that consists of digits.
3991     In both cases, we look for a name first; if not found, we try the other
3992     cases. */
3993    
3994     /* For conditions that are assertions, check the syntax, and then exit
3995     the switch. This will take control down to where bracketed groups,
3996     including assertions, are processed. */
3997    
3998     if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
3999     break;
4000    
4001     /* Most other conditions use OP_CREF (a couple change to OP_RREF
4002     below), and all need to skip 3 bytes at the start of the group. */
4003    
4004     code[1+LINK_SIZE] = OP_CREF;
4005     skipbytes = 3;
4006 ph10 172 refsign = -1;
4007 nigel 93
4008     /* Check for a test for recursion in a named group. */
4009    
4010     if (ptr[1] == 'R' && ptr[2] == '&')
4011 nigel 77 {
4012 nigel 93 terminator = -1;
4013     ptr += 2;
4014     code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
4015     }
4016 nigel 91
4017 nigel 93 /* Check for a test for a named group's having been set, using the Perl
4018     syntax (?(<name>) or (?('name') */
4019 nigel 91
4020 nigel 93 else if (ptr[1] == '<')
4021     {
4022     terminator = '>';
4023     ptr++;
4024     }
4025     else if (ptr[1] == '\'')
4026     {
4027     terminator = '\'';
4028     ptr++;
4029     }
4030 ph10 172 else
4031 ph10 167 {
4032     terminator = 0;
4033 ph10 172 if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
4034     }
4035 nigel 77
4036 nigel 93 /* We now expect to read a name; anything else is an error */
4037 nigel 77
4038 nigel 93 if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
4039     {
4040     ptr += 1; /* To get the right offset */
4041     *errorcodeptr = ERR28;
4042     goto FAILED;
4043     }
4044    
4045     /* Read the name, but also get it as a number if it's all digits */
4046    
4047     recno = 0;
4048     name = ++ptr;
4049     while ((cd->ctypes[*ptr] & ctype_word) != 0)
4050     {
4051     if (recno >= 0)
4052     recno = ((digitab[*ptr] & ctype_digit) != 0)?
4053     recno * 10 + *ptr - '0' : -1;
4054 nigel 91 ptr++;
4055 nigel 93 }
4056     namelen = ptr - name;
4057 nigel 91
4058 nigel 93 if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
4059     {
4060     ptr--; /* Error offset */
4061     *errorcodeptr = ERR26;
4062     goto FAILED;
4063     }
4064 nigel 91
4065 nigel 93 /* Do no further checking in the pre-compile phase. */
4066 nigel 91
4067 nigel 93 if (lengthptr != NULL) break;
4068 nigel 91
4069 nigel 93 /* In the real compile we do the work of looking for the actual
4070 ph10 167 reference. If the string started with "+" or "-" we require the rest to
4071     be digits, in which case recno will be set. */
4072 ph10 172
4073 ph10 167 if (refsign > 0)
4074     {
4075     if (recno <= 0)
4076     {
4077     *errorcodeptr = ERR58;
4078     goto FAILED;
4079 ph10 172 }
4080 ph10 167 if (refsign == '-')
4081     {
4082 ph10