/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 215 - (hide annotations) (download)
Wed Aug 15 14:20:05 2007 UTC (6 years, 8 months ago) by ph10
File MIME type: text/plain
File size: 187512 byte(s)
Fixed overrun for missing ] with a forward reference, e.g. /(?1)\c[/.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 117 Copyright (c) 1997-2007 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains the external function pcre_compile(), along with
42     supporting internal functions that are not used by other modules. */
43    
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 199 #include <config.h>
47 ph10 200 #endif
48 ph10 199
49 nigel 93 #define NLBLOCK cd /* Block containing newline information */
50     #define PSSTART start_pattern /* Field containing processed string start */
51     #define PSEND end_pattern /* Field containing processed string end */
52    
53 nigel 77 #include "pcre_internal.h"
54    
55    
56 nigel 85 /* When DEBUG is defined, we need the pcre_printint() function, which is also
57     used by pcretest. DEBUG is not defined when building a production library. */
58    
59     #ifdef DEBUG
60     #include "pcre_printint.src"
61     #endif
62    
63    
/* Macro for setting individual bits in class bitmaps: it sets bit number b in
the byte-array bitmap a. The arguments and the whole expansion are
parenthesized so that expression arguments, for example SETBIT(map, c + 8),
index and shift by the intended value. Note that b is evaluated twice, so it
must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
67    
68 ph10 202 /* Maximum length value to check against when making sure that the integer that
69     holds the compiled pattern length does not overflow. We make it a bit less than
70     INT_MAX to allow for adding in group terminating bytes, so that we don't have
71     to check them every time. */
72 ph10 178
73 ph10 202 #define OFLOW_MAX (INT_MAX - 20)
74    
75    
76 nigel 77 /*************************************************
77     * Code parameters and static tables *
78     *************************************************/
79    
80 nigel 93 /* This value specifies the size of stack workspace that is used during the
81     first pre-compile phase that determines how much memory is required. The regex
82     is partly compiled into this space, but the compiled parts are discarded as
83     soon as they can be, so that hopefully there will never be an overrun. The code
84     does, however, check for an overrun. The largest amount I've seen used is 218,
85     so this number is very generous.
86 nigel 77
87 nigel 93 The same workspace is used during the second, actual compile phase for
88     remembering forward references to groups so that they can be filled in at the
89     end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
90     is 4 there is plenty of room. */
91 nigel 77
92 nigel 93 #define COMPILE_WORK_SIZE (4096)
93 nigel 77
94 nigel 93
95 nigel 77 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
96     are simple data values; negative values are for special things like \d and so
97     on. Zero means further processing is needed (for things like \x), or the escape
98     is invalid. */
99    
100 ph10 97 #ifndef EBCDIC /* This is the "normal" table for ASCII systems */
101 nigel 77 static const short int escapes[] = {
102     0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
103     0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
104     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
105 ph10 178 -ESC_H, 0, 0, -ESC_K, 0, 0, 0, 0, /* H - O */
106     -ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, -ESC_V, -ESC_W, /* P - W */
107 nigel 77 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
108     '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
109 ph10 178 -ESC_h, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */
110     -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, -ESC_v, -ESC_w, /* p - w */
111 nigel 77 0, 0, -ESC_z /* x - z */
112     };
113    
114 ph10 97 #else /* This is the "abnormal" table for EBCDIC systems */
115 nigel 77 static const short int escapes[] = {
116     /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
117     /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
118     /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
119     /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
120     /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
121     /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
122     /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
123     /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
124 ph10 178 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
125 nigel 93 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
126 nigel 77 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
127 ph10 178 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
128 nigel 77 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
129     /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
130     /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
131     /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
132 ph10 178 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
133 ph10 195 /* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P,
134 nigel 93 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
135 ph10 178 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
136 nigel 77 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
137     /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
138     /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
139     };
140     #endif
141    
142    
143 ph10 210 /* Table of special "verbs" like (*PRUNE) */
144    
145     typedef struct verbitem {
146     const char *name;
147     int len;
148     int op;
149 ph10 211 } verbitem;
150 ph10 210
151     static verbitem verbs[] = {
152     { "ACCEPT", 6, OP_ACCEPT },
153     { "COMMIT", 6, OP_COMMIT },
154     { "F", 1, OP_FAIL },
155 ph10 211 { "FAIL", 4, OP_FAIL },
156 ph10 210 { "PRUNE", 5, OP_PRUNE },
157     { "SKIP", 4, OP_SKIP },
158     { "THEN", 4, OP_THEN }
159     };
160    
161     static int verbcount = sizeof(verbs)/sizeof(verbitem);
162    
163    
164 nigel 77 /* Tables of names of POSIX character classes and their lengths. The list is
165 nigel 87 terminated by a zero length entry. The first three must be alpha, lower, upper,
166 nigel 77 as this is assumed for handling case independence. */
167    
168     static const char *const posix_names[] = {
169     "alpha", "lower", "upper",
170     "alnum", "ascii", "blank", "cntrl", "digit", "graph",
171     "print", "punct", "space", "word", "xdigit" };
172    
173     static const uschar posix_name_lengths[] = {
174     5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
175    
176 nigel 87 /* Table of class bit maps for each POSIX class. Each class is formed from a
177     base map, with an optional addition or removal of another map. Then, for some
178     classes, there is some additional tweaking: for [:blank:] the vertical space
179     characters are removed, and for [:alpha:] and [:alnum:] the underscore
180     character is removed. The triples in the table consist of the base map offset,
181     second map offset or -1 if no second map, and a non-negative value for map
182     addition or a negative value for map subtraction (if there are two maps). The
183     absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
184     remove vertical space characters, 2 => remove underscore. */
185 nigel 77
186     static const int posix_class_maps[] = {
187 nigel 87 cbit_word, cbit_digit, -2, /* alpha */
188     cbit_lower, -1, 0, /* lower */
189     cbit_upper, -1, 0, /* upper */
190     cbit_word, -1, 2, /* alnum - word without underscore */
191     cbit_print, cbit_cntrl, 0, /* ascii */
192     cbit_space, -1, 1, /* blank - a GNU extension */
193     cbit_cntrl, -1, 0, /* cntrl */
194     cbit_digit, -1, 0, /* digit */
195     cbit_graph, -1, 0, /* graph */
196     cbit_print, -1, 0, /* print */
197     cbit_punct, -1, 0, /* punct */
198     cbit_space, -1, 0, /* space */
199     cbit_word, -1, 0, /* word - a Perl extension */
200     cbit_xdigit,-1, 0 /* xdigit */
201 nigel 77 };
202    
203    
/* Two-level stringification, so that a macro argument is expanded before it
is turned into a string. */

#define STRING(a)  # a
#define XSTRING(s) STRING(s)

/* The texts of the compile-time error messages, indexed by error number. They
are "char *" because they are passed to the outside world. Error numbers are
documented, so a number must never be re-used: always append a new message
instead. Entries marked DEAD are no longer used but keep their slot. */

static const char *error_texts[] = {
  /*  0 */ "no error",
  /*  1 */ "\\ at end of pattern",
  /*  2 */ "\\c at end of pattern",
  /*  3 */ "unrecognized character follows \\",
  /*  4 */ "numbers out of order in {} quantifier",
  /*  5 */ "number too big in {} quantifier",
  /*  6 */ "missing terminating ] for character class",
  /*  7 */ "invalid escape sequence in character class",
  /*  8 */ "range out of order in character class",
  /*  9 */ "nothing to repeat",
  /* 10 */ "operand of unlimited repeat could match the empty string",  /** DEAD **/
  /* 11 */ "internal error: unexpected repeat",
  /* 12 */ "unrecognized character after (?",
  /* 13 */ "POSIX named classes are supported only within a class",
  /* 14 */ "missing )",
  /* 15 */ "reference to non-existent subpattern",
  /* 16 */ "erroffset passed as NULL",
  /* 17 */ "unknown option bit(s) set",
  /* 18 */ "missing ) after comment",
  /* 19 */ "parentheses nested too deeply",  /** DEAD **/
  /* 20 */ "regular expression is too large",
  /* 21 */ "failed to get memory",
  /* 22 */ "unmatched parentheses",
  /* 23 */ "internal error: code overflow",
  /* 24 */ "unrecognized character after (?<",
  /* 25 */ "lookbehind assertion is not fixed length",
  /* 26 */ "malformed number or name after (?(",
  /* 27 */ "conditional group contains more than two branches",
  /* 28 */ "assertion expected after (?(",
  /* 29 */ "(?R or (?[+-]digits must be followed by )",
  /* 30 */ "unknown POSIX class name",
  /* 31 */ "POSIX collating elements are not supported",
  /* 32 */ "this version of PCRE is not compiled with PCRE_UTF8 support",
  /* 33 */ "spare error",  /** DEAD **/
  /* 34 */ "character value in \\x{...} sequence is too large",
  /* 35 */ "invalid condition (?(0)",
  /* 36 */ "\\C not allowed in lookbehind assertion",
  /* 37 */ "PCRE does not support \\L, \\l, \\N, \\U, or \\u",
  /* 38 */ "number after (?C is > 255",
  /* 39 */ "closing ) for (?C expected",
  /* 40 */ "recursive call could loop indefinitely",
  /* 41 */ "unrecognized character after (?P",
  /* 42 */ "syntax error in subpattern name (missing terminator)",
  /* 43 */ "two named subpatterns have the same name",
  /* 44 */ "invalid UTF-8 string",
  /* 45 */ "support for \\P, \\p, and \\X has not been compiled",
  /* 46 */ "malformed \\P or \\p sequence",
  /* 47 */ "unknown property name after \\P or \\p",
  /* 48 */ "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)",
  /* 49 */ "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")",
  /* 50 */ "repeated subpattern is too long",  /** DEAD **/
  /* 51 */ "octal value is greater than \\377 (not in UTF-8 mode)",
  /* 52 */ "internal error: overran compiling workspace",
  /* 53 */ "internal error: previously-checked referenced subpattern not found",
  /* 54 */ "DEFINE group contains more than one branch",
  /* 55 */ "repeating a DEFINE group is not allowed",
  /* 56 */ "inconsistent NEWLINE options",
  /* 57 */ "\\g is not followed by a braced name or an optionally braced non-zero number",
  /* 58 */ "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number",
  /* 59 */ "(*VERB) with an argument is not supported",
  /* 60 */ "(*VERB) not recognized",
  /* 61 */ "number is too big"
};
288    
289    
290     /* Table to identify digits and hex digits. This is used when compiling
291     patterns. Note that the tables in chartables are dependent on the locale, and
292     may mark arbitrary characters as digits - but the PCRE compiling code expects
293     to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
294     a private table here. It costs 256 bytes, but it is a lot faster than doing
295     character value tests (at least in some simple cases I timed), and in some
296     applications one wants PCRE to compile efficiently as well as match
297     efficiently.
298    
299     For convenience, we use the same bit definitions as in chartables:
300    
301     0x04 decimal digit
302     0x08 hexadecimal digit
303    
304     Then we can use ctype_digit and ctype_xdigit in the code. */
305    
#ifdef EBCDIC    /* This is the "abnormal" case, for EBCDIC systems */

/* Digit table for EBCDIC: 0x04 marks a decimal digit and 0x08 a hexadecimal
digit, so 0x0c marks a character that is both. */

static const unsigned char digitab[] =
  {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 00 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 08 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 10 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 18 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 20 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 28 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 30 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 38 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 50 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 58 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 60 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 68 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 70 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 78 */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 80: a - g */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 90 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 98 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* A0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* A8 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* B0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* B8 */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* C0: A - G */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* C8 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* D0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* D8 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* E0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* E8 */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* F0: 0 - 7 */
  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00  /* F8: 8 - 9 */
  };

/* Partial duplicate of the character-type table for EBCDIC. check_escape()
tests entries with the mask 0x0E to decide whether a character is alphameric. */

static const unsigned char ebcdic_chartab[] =
  {
  0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 00 */
  0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 08 */
  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 10 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 18 */
  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 20 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 28 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 30 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 38 */
  0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40 */
  0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 48 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 50 */
  0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 58 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 60 */
  0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 68 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 70 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 78 */
  0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 80 */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* 88 */
  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 90 */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* 98 */
  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* A0 */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* A8 */
  0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* B0 */
  0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* B8 */
  0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* C0 */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* C8 */
  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* D0 */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* D8 */
  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* E0 */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* E8 */
  0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* F0 */
  0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00  /* F8 */
  };

#else            /* This is the "normal" case, for ASCII systems */

/* Digit table for ASCII: 0x04 marks a decimal digit and 0x08 a hexadecimal
digit, so 0x0c marks a character that is both. */

static const unsigned char digitab[] =
  {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 00 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 08 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 10 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 18 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 20: space - ' */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 28: ( - / */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 30: 0 - 7 */
  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 38: 8 - ? */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 40: @ - G */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48: H - O */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 50: P - W */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 58: X - _ */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 60: ` - g */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 68: h - o */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 70: p - w */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 78: x - 127 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 80 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 90 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 98 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* A0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* A8 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* B0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* B8 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* C0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* C8 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* D0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* D8 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* E0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* E8 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* F0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00  /* F8 */
  };

#endif
412    
413    
414     /* Definition to allow mutual recursion */
415    
416     static BOOL
417 ph10 180 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
418 ph10 175 int *, int *, branch_chain *, compile_data *, int *);
419 nigel 77
420    
421    
422     /*************************************************
423     * Handle escapes *
424     *************************************************/
425    
426     /* This function is called when a \ has been encountered. It either returns a
427     positive value for a simple escape such as \n, or a negative value which
428 nigel 93 encodes one of the more complicated things such as \d. A backreference to group
429     n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
430     UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
431     ptr is pointing at the \. On exit, it is on the final character of the escape
432     sequence.
433 nigel 77
434     Arguments:
435     ptrptr points to the pattern position pointer
436     errorcodeptr points to the errorcode variable
437     bracount number of previous extracting brackets
438     options the options bits
439     isclass TRUE if inside a character class
440    
441     Returns: zero or positive => a data character
442     negative => a special escape sequence
443 ph10 213 on error, errorcodeptr is set
444 nigel 77 */
445    
446     static int
447     check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
448     int options, BOOL isclass)
449     {
450 nigel 87 BOOL utf8 = (options & PCRE_UTF8) != 0;
451     const uschar *ptr = *ptrptr + 1;
452 nigel 77 int c, i;
453    
454 nigel 87 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
455     ptr--; /* Set pointer back to the last byte */
456    
457 nigel 77 /* If backslash is at the end of the pattern, it's an error. */
458    
459     if (c == 0) *errorcodeptr = ERR1;
460    
461     /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
462     a table. A non-zero result is something that can be returned immediately.
463     Otherwise further processing may be required. */
464    
465 ph10 97 #ifndef EBCDIC /* ASCII coding */
466 nigel 77 else if (c < '0' || c > 'z') {} /* Not alphameric */
467     else if ((i = escapes[c - '0']) != 0) c = i;
468    
469 ph10 97 #else /* EBCDIC coding */
470 nigel 77 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
471     else if ((i = escapes[c - 0x48]) != 0) c = i;
472     #endif
473    
474     /* Escapes that need further processing, or are illegal. */
475    
476     else
477     {
478     const uschar *oldptr;
479 nigel 93 BOOL braced, negated;
480    
481 nigel 77 switch (c)
482     {
483     /* A number of Perl escapes are not handled by PCRE. We give an explicit
484     error. */
485    
486     case 'l':
487     case 'L':
488     case 'N':
489     case 'u':
490     case 'U':
491     *errorcodeptr = ERR37;
492     break;
493    
494 nigel 93 /* \g must be followed by a number, either plain or braced. If positive, it
495     is an absolute backreference. If negative, it is a relative backreference.
496 ph10 172 This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a
497     reference to a named group. This is part of Perl's movement towards a
498     unified syntax for back references. As this is synonymous with \k{name}, we
499 ph10 171 fudge it up by pretending it really was \k. */
500 nigel 93
501     case 'g':
502     if (ptr[1] == '{')
503     {
504 ph10 171 const uschar *p;
505     for (p = ptr+2; *p != 0 && *p != '}'; p++)
506     if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
507 ph10 172 if (*p != 0 && *p != '}')
508 ph10 171 {
509     c = -ESC_k;
510     break;
511 ph10 172 }
512 nigel 93 braced = TRUE;
513     ptr++;
514     }
515     else braced = FALSE;
516    
517     if (ptr[1] == '-')
518     {
519     negated = TRUE;
520     ptr++;
521     }
522     else negated = FALSE;
523    
524     c = 0;
525     while ((digitab[ptr[1]] & ctype_digit) != 0)
526     c = c * 10 + *(++ptr) - '0';
527 ph10 213
528     if (c < 0)
529     {
530     *errorcodeptr = ERR61;
531     break;
532     }
533 nigel 93
534     if (c == 0 || (braced && *(++ptr) != '}'))
535     {
536     *errorcodeptr = ERR57;
537 ph10 213 break;
538 nigel 93 }
539    
540     if (negated)
541     {
542     if (c > bracount)
543     {
544     *errorcodeptr = ERR15;
545 ph10 213 break;
546 nigel 93 }
547     c = bracount - (c - 1);
548     }
549    
550     c = -(ESC_REF + c);
551     break;
552    
553 nigel 77 /* The handling of escape sequences consisting of a string of digits
554     starting with one that is not zero is not straightforward. By experiment,
555     the way Perl works seems to be as follows:
556    
557     Outside a character class, the digits are read as a decimal number. If the
558     number is less than 10, or if there are that many previous extracting
559     left brackets, then it is a back reference. Otherwise, up to three octal
560     digits are read to form an escaped byte. Thus \123 is likely to be octal
561     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
562     value is greater than 377, the least significant 8 bits are taken. Inside a
563     character class, \ followed by a digit is always an octal number. */
564    
565     case '1': case '2': case '3': case '4': case '5':
566     case '6': case '7': case '8': case '9':
567    
568     if (!isclass)
569     {
570     oldptr = ptr;
571     c -= '0';
572     while ((digitab[ptr[1]] & ctype_digit) != 0)
573     c = c * 10 + *(++ptr) - '0';
574 ph10 213 if (c < 0)
575     {
576     *errorcodeptr = ERR61;
577     break;
578     }
579 nigel 77 if (c < 10 || c <= bracount)
580     {
581     c = -(ESC_REF + c);
582     break;
583     }
584     ptr = oldptr; /* Put the pointer back and fall through */
585     }
586    
587     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
588     generates a binary zero byte and treats the digit as a following literal.
589     Thus we have to pull back the pointer by one. */
590    
591     if ((c = *ptr) >= '8')
592     {
593     ptr--;
594     c = 0;
595     break;
596     }
597    
598     /* \0 always starts an octal number, but we may drop through to here with a
599 nigel 91 larger first octal digit. The original code used just to take the least
600     significant 8 bits of octal numbers (I think this is what early Perls used
601     to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
602     than 3 octal digits. */
603 nigel 77
604     case '0':
605     c -= '0';
606     while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
607     c = c * 8 + *(++ptr) - '0';
608 nigel 91 if (!utf8 && c > 255) *errorcodeptr = ERR51;
609 nigel 77 break;
610    
611 nigel 87 /* \x is complicated. \x{ddd} is a character number which can be greater
612     than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
613     treated as a data character. */
614 nigel 77
615     case 'x':
616 nigel 87 if (ptr[1] == '{')
617 nigel 77 {
618     const uschar *pt = ptr + 2;
619 nigel 87 int count = 0;
620    
621 nigel 77 c = 0;
622     while ((digitab[*pt] & ctype_xdigit) != 0)
623     {
624 nigel 87 register int cc = *pt++;
625     if (c == 0 && cc == '0') continue; /* Leading zeroes */
626 nigel 77 count++;
627 nigel 87
628 ph10 97 #ifndef EBCDIC /* ASCII coding */
629 nigel 77 if (cc >= 'a') cc -= 32; /* Convert to upper case */
630 nigel 87 c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
631 ph10 97 #else /* EBCDIC coding */
632 nigel 77 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
633 nigel 87 c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
634 nigel 77 #endif
635     }
636 nigel 87
637 nigel 77 if (*pt == '}')
638     {
639 nigel 87 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
640 nigel 77 ptr = pt;
641     break;
642     }
643 nigel 87
644 nigel 77 /* If the sequence of hex digits does not end with '}', then we don't
645     recognize this construct; fall through to the normal \x handling. */
646     }
647    
648 nigel 87 /* Read just a single-byte hex-defined char */
649 nigel 77
650     c = 0;
651     while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
652     {
653     int cc; /* Some compilers don't like ++ */
654     cc = *(++ptr); /* in initializers */
655 ph10 97 #ifndef EBCDIC /* ASCII coding */
656 nigel 77 if (cc >= 'a') cc -= 32; /* Convert to upper case */
657     c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
658 ph10 97 #else /* EBCDIC coding */
659 nigel 77 if (cc <= 'z') cc += 64; /* Convert to upper case */
660     c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
661     #endif
662     }
663     break;
664    
665 nigel 93 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
666     This coding is ASCII-specific, but then the whole concept of \cx is
667     ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
668 nigel 77
669     case 'c':
670     c = *(++ptr);
671     if (c == 0)
672     {
673     *errorcodeptr = ERR2;
674 ph10 213 break;
675 nigel 77 }
676    
677 ph10 97 #ifndef EBCDIC /* ASCII coding */
678 nigel 77 if (c >= 'a' && c <= 'z') c -= 32;
679     c ^= 0x40;
680 ph10 97 #else /* EBCDIC coding */
681 nigel 77 if (c >= 'a' && c <= 'z') c += 64;
682     c ^= 0xC0;
683     #endif
684     break;
685    
686     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
687     other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
688     for Perl compatibility, it is a literal. This code looks a bit odd, but
689     there used to be some cases other than the default, and there may be again
690     in future, so I haven't "optimized" it. */
691    
692     default:
693     if ((options & PCRE_EXTRA) != 0) switch(c)
694     {
695     default:
696     *errorcodeptr = ERR3;
697     break;
698     }
699     break;
700     }
701     }
702    
703     *ptrptr = ptr;
704     return c;
705     }
706    
707    
708    
#ifdef SUPPORT_UCP
/*************************************************
*               Handle \P and \p                 *
*************************************************/

/* Parse the property name that follows \P or \p and look it up in the table
of known Unicode property names. This is compiled only when PCRE is built with
support for Unicode properties. On entry, *ptrptr points at the P or p; on exit
it points at the last character that was examined.

Arguments:
  ptrptr         points to the pattern position pointer
  negptr         points to a boolean that is set TRUE for negation else FALSE
                   (it is left untouched if the pattern ends right after the
                   P or p)
  dptr           points to an int that is set to the detailed property value
  errorcodeptr   points to the error code variable; set to ERR46 for a
                   malformed sequence, ERR47 for an unknown property name

Returns:         type value from ucp_type_table, or -1 for an invalid type
*/

static int
get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
{
const uschar *ptr = *ptrptr;
char name[32];
int ch, len, lo, hi;

ch = *(++ptr);
if (ch == 0) goto MALFORMED;

*negptr = FALSE;

/* The short form is a single character directly after the P or p. */

if (ch != '{')
  {
  name[0] = ch;
  name[1] = 0;
  }

/* The long form is a name in {}, optionally preceded by ^ for negation. The
name must leave room for the terminating zero and must be closed by '}' before
the buffer limit is reached. */

else
  {
  if (ptr[1] == '^')
    {
    *negptr = TRUE;
    ptr++;
    }

  len = 0;
  for (;;)
    {
    if (len >= (int)sizeof(name) - 1) goto MALFORMED;   /* Name too long */
    ch = *(++ptr);
    if (ch == 0) goto MALFORMED;                        /* Pattern ended */
    if (ch == '}') break;
    name[len++] = ch;
    }
  name[len] = 0;
  }

*ptrptr = ptr;

/* Binary chop over the name table for the property name. */

lo = 0;
hi = _pcre_utt_size;

while (lo < hi)
  {
  int mid = (lo + hi) >> 1;
  int cmp = strcmp(name, _pcre_utt[mid].name);
  if (cmp > 0) lo = mid + 1;
  else if (cmp < 0) hi = mid;
  else
    {
    *dptr = _pcre_utt[mid].value;
    return _pcre_utt[mid].type;
    }
  }

/* The syntax was fine, but the name is not a known property. */

*errorcodeptr = ERR47;
*ptrptr = ptr;
return -1;

/* The sequence itself was malformed. */

MALFORMED:
*errorcodeptr = ERR46;
*ptrptr = ptr;
return -1;
}
#endif
798    
799    
800    
801    
802     /*************************************************
803     * Check for counted repeat *
804     *************************************************/
805    
806     /* This function is called when a '{' is encountered in a place where it might
807     start a quantifier. It looks ahead to see if it really is a quantifier or not.
808     It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
809     where the ddds are digits.
810    
811     Arguments:
812     p pointer to the first char after '{'
813    
814     Returns: TRUE or FALSE
815     */
816    
817     static BOOL
818     is_counted_repeat(const uschar *p)
819     {
820     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
821     while ((digitab[*p] & ctype_digit) != 0) p++;
822     if (*p == '}') return TRUE;
823    
824     if (*p++ != ',') return FALSE;
825     if (*p == '}') return TRUE;
826    
827     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
828     while ((digitab[*p] & ctype_digit) != 0) p++;
829    
830     return (*p == '}');
831     }
832    
833    
834    
835     /*************************************************
836     * Read repeat counts *
837     *************************************************/
838    
839     /* Read an item of the form {n,m} and return the values. This is called only
840     after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
841     so the syntax is guaranteed to be correct, but we need to check the values.
842    
843     Arguments:
844     p pointer to first char after '{'
845     minp pointer to int for min
846     maxp pointer to int for max
847     returned as -1 if no max
848     errorcodeptr points to error code variable
849    
850     Returns: pointer to '}' on success;
851     current ptr on error, with errorcodeptr set non-zero
852     */
853    
854     static const uschar *
855     read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
856     {
857     int min = 0;
858     int max = -1;
859    
860 nigel 81 /* Read the minimum value and do a paranoid check: a negative value indicates
861     an integer overflow. */
862    
863 nigel 77 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
864 nigel 81 if (min < 0 || min > 65535)
865     {
866     *errorcodeptr = ERR5;
867     return p;
868     }
869 nigel 77
870 nigel 81 /* Read the maximum value if there is one, and again do a paranoid on its size.
871     Also, max must not be less than min. */
872    
873 nigel 77 if (*p == '}') max = min; else
874     {
875     if (*(++p) != '}')
876     {
877     max = 0;
878     while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
879 nigel 81 if (max < 0 || max > 65535)
880     {
881     *errorcodeptr = ERR5;
882     return p;
883     }
884 nigel 77 if (max < min)
885     {
886     *errorcodeptr = ERR4;
887     return p;
888     }
889     }
890     }
891    
892 nigel 81 /* Fill in the required variables, and pass back the pointer to the terminating
893     '}'. */
894 nigel 77
895 nigel 81 *minp = min;
896     *maxp = max;
897 nigel 77 return p;
898     }
899    
900    
901    
902     /*************************************************
903 nigel 93 * Find forward referenced subpattern *
904 nigel 91 *************************************************/
905    
906 nigel 93 /* This function scans along a pattern's text looking for capturing
907     subpatterns, and counting them. If it finds a named pattern that matches the
908     name it is given, it returns its number. Alternatively, if the name is NULL, it
909     returns when it reaches a given numbered subpattern. This is used for forward
910     references to subpatterns. We know that if (?P< is encountered, the name will
911     be terminated by '>' because that is checked in the first pass.
912 nigel 91
913     Arguments:
914 nigel 93 ptr current position in the pattern
915     count current count of capturing parens so far encountered
916     name name to seek, or NULL if seeking a numbered subpattern
917     lorn name length, or subpattern number if name is NULL
918     xmode TRUE if we are in /x mode
919 nigel 91
920     Returns: the number of the named subpattern, or -1 if not found
921     */
922    
923     static int
924 nigel 93 find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
925     BOOL xmode)
926 nigel 91 {
927     const uschar *thisname;
928 nigel 93
929 nigel 91 for (; *ptr != 0; ptr++)
930     {
931 nigel 93 int term;
932    
933     /* Skip over backslashed characters and also entire \Q...\E */
934    
935     if (*ptr == '\\')
936     {
937     if (*(++ptr) == 0) return -1;
938     if (*ptr == 'Q') for (;;)
939     {
940     while (*(++ptr) != 0 && *ptr != '\\');
941     if (*ptr == 0) return -1;
942     if (*(++ptr) == 'E') break;
943     }
944     continue;
945     }
946    
947     /* Skip over character classes */
948    
949     if (*ptr == '[')
950     {
951     while (*(++ptr) != ']')
952     {
953 ph10 215 if (*ptr == 0) return -1;
954 nigel 93 if (*ptr == '\\')
955     {
956     if (*(++ptr) == 0) return -1;
957     if (*ptr == 'Q') for (;;)
958     {
959     while (*(++ptr) != 0 && *ptr != '\\');
960     if (*ptr == 0) return -1;
961     if (*(++ptr) == 'E') break;
962     }
963     continue;
964     }
965     }
966     continue;
967     }
968    
969     /* Skip comments in /x mode */
970    
971     if (xmode && *ptr == '#')
972     {
973     while (*(++ptr) != 0 && *ptr != '\n');
974     if (*ptr == 0) return -1;
975     continue;
976     }
977    
978     /* An opening parens must now be a real metacharacter */
979    
980 nigel 91 if (*ptr != '(') continue;
981 ph10 210 if (ptr[1] != '?' && ptr[1] != '*')
982 nigel 93 {
983     count++;
984     if (name == NULL && count == lorn) return count;
985     continue;
986     }
987    
988     ptr += 2;
989     if (*ptr == 'P') ptr++; /* Allow optional P */
990    
991     /* We have to disambiguate (?<! and (?<= from (?<name> */
992    
993     if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
994     *ptr != '\'')
995     continue;
996    
997 nigel 91 count++;
998 nigel 93
999     if (name == NULL && count == lorn) return count;
1000     term = *ptr++;
1001     if (term == '<') term = '>';
1002 nigel 91 thisname = ptr;
1003 nigel 93 while (*ptr != term) ptr++;
1004     if (name != NULL && lorn == ptr - thisname &&
1005     strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1006 nigel 91 return count;
1007     }
1008 nigel 93
1009 nigel 91 return -1;
1010     }
1011    
1012    
1013    
1014     /*************************************************
1015 nigel 77 * Find first significant op code *
1016     *************************************************/
1017    
1018     /* This is called by several functions that scan a compiled expression looking
1019     for a fixed first character, or an anchoring op code etc. It skips over things
1020     that do not influence this. For some calls, a change of option is important.
1021     For some calls, it makes sense to skip negative forward and all backward
1022     assertions, and also the \b assertion; for others it does not.
1023    
1024     Arguments:
1025     code pointer to the start of the group
1026     options pointer to external options
1027     optbit the option bit whose changing is significant, or
1028     zero if none are
1029     skipassert TRUE if certain assertions are to be skipped
1030    
1031     Returns: pointer to the first significant opcode
1032     */
1033    
1034     static const uschar*
1035     first_significant_code(const uschar *code, int *options, int optbit,
1036     BOOL skipassert)
1037     {
1038     for (;;)
1039     {
1040     switch ((int)*code)
1041     {
1042     case OP_OPT:
1043     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1044     *options = (int)code[1];
1045     code += 2;
1046     break;
1047    
1048     case OP_ASSERT_NOT:
1049     case OP_ASSERTBACK:
1050     case OP_ASSERTBACK_NOT:
1051     if (!skipassert) return code;
1052     do code += GET(code, 1); while (*code == OP_ALT);
1053     code += _pcre_OP_lengths[*code];
1054     break;
1055    
1056     case OP_WORD_BOUNDARY:
1057     case OP_NOT_WORD_BOUNDARY:
1058     if (!skipassert) return code;
1059     /* Fall through */
1060    
1061     case OP_CALLOUT:
1062     case OP_CREF:
1063 nigel 93 case OP_RREF:
1064     case OP_DEF:
1065 nigel 77 code += _pcre_OP_lengths[*code];
1066     break;
1067    
1068     default:
1069     return code;
1070     }
1071     }
1072     /* Control never reaches here */
1073     }
1074    
1075    
1076    
1077    
1078     /*************************************************
1079     * Find the fixed length of a pattern *
1080     *************************************************/
1081    
1082     /* Scan a pattern and compute the fixed length of subject that will match it,
1083     if the length is fixed. This is needed for dealing with backward assertions.
1084     In UTF8 mode, the result is in characters rather than bytes.
1085    
1086     Arguments:
1087     code points to the start of the pattern (the bracket)
1088     options the compiling options
1089    
1090     Returns: the fixed length, or -1 if there is no fixed length,
1091     or -2 if \C was encountered
1092     */
1093    
1094     static int
1095     find_fixedlength(uschar *code, int options)
1096     {
1097     int length = -1;
1098    
1099     register int branchlength = 0;
1100     register uschar *cc = code + 1 + LINK_SIZE;
1101    
1102     /* Scan along the opcodes for this branch. If we get to the end of the
1103     branch, check the length against that of the other branches. */
1104    
1105     for (;;)
1106     {
1107     int d;
1108     register int op = *cc;
1109    
1110     switch (op)
1111     {
1112 nigel 93 case OP_CBRA:
1113 nigel 77 case OP_BRA:
1114     case OP_ONCE:
1115     case OP_COND:
1116 nigel 93 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1117 nigel 77 if (d < 0) return d;
1118     branchlength += d;
1119     do cc += GET(cc, 1); while (*cc == OP_ALT);
1120     cc += 1 + LINK_SIZE;
1121     break;
1122    
1123     /* Reached end of a branch; if it's a ket it is the end of a nested
1124     call. If it's ALT it is an alternation in a nested call. If it is
1125     END it's the end of the outer call. All can be handled by the same code. */
1126    
1127     case OP_ALT:
1128     case OP_KET:
1129     case OP_KETRMAX:
1130     case OP_KETRMIN:
1131     case OP_END:
1132     if (length < 0) length = branchlength;
1133     else if (length != branchlength) return -1;
1134     if (*cc != OP_ALT) return length;
1135     cc += 1 + LINK_SIZE;
1136     branchlength = 0;
1137     break;
1138    
1139     /* Skip over assertive subpatterns */
1140    
1141     case OP_ASSERT:
1142     case OP_ASSERT_NOT:
1143     case OP_ASSERTBACK:
1144     case OP_ASSERTBACK_NOT:
1145     do cc += GET(cc, 1); while (*cc == OP_ALT);
1146     /* Fall through */
1147    
1148     /* Skip over things that don't match chars */
1149    
1150     case OP_REVERSE:
1151     case OP_CREF:
1152 nigel 93 case OP_RREF:
1153     case OP_DEF:
1154 nigel 77 case OP_OPT:
1155     case OP_CALLOUT:
1156     case OP_SOD:
1157     case OP_SOM:
1158     case OP_EOD:
1159     case OP_EODN:
1160     case OP_CIRC:
1161     case OP_DOLL:
1162     case OP_NOT_WORD_BOUNDARY:
1163     case OP_WORD_BOUNDARY:
1164     cc += _pcre_OP_lengths[*cc];
1165     break;
1166    
1167     /* Handle literal characters */
1168    
1169     case OP_CHAR:
1170     case OP_CHARNC:
1171 nigel 91 case OP_NOT:
1172 nigel 77 branchlength++;
1173     cc += 2;
1174     #ifdef SUPPORT_UTF8
1175     if ((options & PCRE_UTF8) != 0)
1176     {
1177     while ((*cc & 0xc0) == 0x80) cc++;
1178     }
1179     #endif
1180     break;
1181    
1182     /* Handle exact repetitions. The count is already in characters, but we
1183     need to skip over a multibyte character in UTF8 mode. */
1184    
1185     case OP_EXACT:
1186     branchlength += GET2(cc,1);
1187     cc += 4;
1188     #ifdef SUPPORT_UTF8
1189     if ((options & PCRE_UTF8) != 0)
1190     {
1191     while((*cc & 0x80) == 0x80) cc++;
1192     }
1193     #endif
1194     break;
1195    
1196     case OP_TYPEEXACT:
1197     branchlength += GET2(cc,1);
1198     cc += 4;
1199     break;
1200    
1201     /* Handle single-char matchers */
1202    
1203     case OP_PROP:
1204     case OP_NOTPROP:
1205 nigel 87 cc += 2;
1206 nigel 77 /* Fall through */
1207    
1208     case OP_NOT_DIGIT:
1209     case OP_DIGIT:
1210     case OP_NOT_WHITESPACE:
1211     case OP_WHITESPACE:
1212     case OP_NOT_WORDCHAR:
1213     case OP_WORDCHAR:
1214     case OP_ANY:
1215     branchlength++;
1216     cc++;
1217     break;
1218    
1219     /* The single-byte matcher isn't allowed */
1220    
1221     case OP_ANYBYTE:
1222     return -2;
1223    
1224     /* Check a class for variable quantification */
1225    
1226     #ifdef SUPPORT_UTF8
1227     case OP_XCLASS:
1228     cc += GET(cc, 1) - 33;
1229     /* Fall through */
1230     #endif
1231    
1232     case OP_CLASS:
1233     case OP_NCLASS:
1234     cc += 33;
1235    
1236     switch (*cc)
1237     {
1238     case OP_CRSTAR:
1239     case OP_CRMINSTAR:
1240     case OP_CRQUERY:
1241     case OP_CRMINQUERY:
1242     return -1;
1243    
1244     case OP_CRRANGE:
1245     case OP_CRMINRANGE:
1246     if (GET2(cc,1) != GET2(cc,3)) return -1;
1247     branchlength += GET2(cc,1);
1248     cc += 5;
1249     break;
1250    
1251     default:
1252     branchlength++;
1253     }
1254     break;
1255    
1256     /* Anything else is variable length */
1257    
1258     default:
1259     return -1;
1260     }
1261     }
1262     /* Control never gets here */
1263     }
1264    
1265    
1266    
1267    
1268     /*************************************************
1269     * Scan compiled regex for numbered bracket *
1270     *************************************************/
1271    
1272     /* This little function scans through a compiled pattern until it finds a
1273     capturing bracket with the given number.
1274    
1275     Arguments:
1276     code points to start of expression
1277     utf8 TRUE in UTF-8 mode
1278     number the required bracket number
1279    
1280     Returns: pointer to the opcode for the bracket, or NULL if not found
1281     */
1282    
1283     static const uschar *
1284     find_bracket(const uschar *code, BOOL utf8, int number)
1285     {
1286     for (;;)
1287     {
1288     register int c = *code;
1289     if (c == OP_END) return NULL;
1290 nigel 91
1291     /* XCLASS is used for classes that cannot be represented just by a bit
1292     map. This includes negated single high-valued characters. The length in
1293     the table is zero; the actual length is stored in the compiled code. */
1294    
1295     if (c == OP_XCLASS) code += GET(code, 1);
1296    
1297 nigel 93 /* Handle capturing bracket */
1298 nigel 91
1299 nigel 93 else if (c == OP_CBRA)
1300 nigel 77 {
1301 nigel 93 int n = GET2(code, 1+LINK_SIZE);
1302 nigel 77 if (n == number) return (uschar *)code;
1303 nigel 93 code += _pcre_OP_lengths[c];
1304 nigel 77 }
1305 nigel 91
1306 nigel 93 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1307     a multi-byte character. The length in the table is a minimum, so we have to
1308     arrange to skip the extra bytes. */
1309 nigel 91
1310 nigel 77 else
1311     {
1312     code += _pcre_OP_lengths[c];
1313 ph10 107 #ifdef SUPPORT_UTF8
1314 nigel 77 if (utf8) switch(c)
1315     {
1316     case OP_CHAR:
1317     case OP_CHARNC:
1318     case OP_EXACT:
1319     case OP_UPTO:
1320     case OP_MINUPTO:
1321 nigel 93 case OP_POSUPTO:
1322 nigel 77 case OP_STAR:
1323     case OP_MINSTAR:
1324 nigel 93 case OP_POSSTAR:
1325 nigel 77 case OP_PLUS:
1326     case OP_MINPLUS:
1327 nigel 93 case OP_POSPLUS:
1328 nigel 77 case OP_QUERY:
1329     case OP_MINQUERY:
1330 nigel 93 case OP_POSQUERY:
1331     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1332 nigel 77 break;
1333     }
1334 ph10 111 #endif
1335 nigel 77 }
1336     }
1337     }
1338    
1339    
1340    
1341     /*************************************************
1342     * Scan compiled regex for recursion reference *
1343     *************************************************/
1344    
1345     /* This little function scans through a compiled pattern until it finds an
1346     instance of OP_RECURSE.
1347    
1348     Arguments:
1349     code points to start of expression
1350     utf8 TRUE in UTF-8 mode
1351    
1352     Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1353     */
1354    
1355     static const uschar *
1356     find_recurse(const uschar *code, BOOL utf8)
1357     {
1358     for (;;)
1359     {
1360     register int c = *code;
1361     if (c == OP_END) return NULL;
1362 nigel 91 if (c == OP_RECURSE) return code;
1363    
1364     /* XCLASS is used for classes that cannot be represented just by a bit
1365     map. This includes negated single high-valued characters. The length in
1366     the table is zero; the actual length is stored in the compiled code. */
1367    
1368     if (c == OP_XCLASS) code += GET(code, 1);
1369    
1370     /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes
1371     that are followed by a character may be followed by a multi-byte character.
1372 nigel 93 The length in the table is a minimum, so we have to arrange to skip the extra
1373     bytes. */
1374 nigel 91
1375 nigel 77 else
1376     {
1377     code += _pcre_OP_lengths[c];
1378 ph10 107 #ifdef SUPPORT_UTF8
1379 nigel 77 if (utf8) switch(c)
1380     {
1381     case OP_CHAR:
1382     case OP_CHARNC:
1383     case OP_EXACT:
1384     case OP_UPTO:
1385     case OP_MINUPTO:
1386 nigel 93 case OP_POSUPTO:
1387 nigel 77 case OP_STAR:
1388     case OP_MINSTAR:
1389 nigel 93 case OP_POSSTAR:
1390 nigel 77 case OP_PLUS:
1391     case OP_MINPLUS:
1392 nigel 93 case OP_POSPLUS:
1393 nigel 77 case OP_QUERY:
1394     case OP_MINQUERY:
1395 nigel 93 case OP_POSQUERY:
1396     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1397 nigel 77 break;
1398     }
1399 ph10 111 #endif
1400 nigel 77 }
1401     }
1402     }
1403    
1404    
1405    
1406     /*************************************************
1407     * Scan compiled branch for non-emptiness *
1408     *************************************************/
1409    
1410     /* This function scans through a branch of a compiled pattern to see whether it
1411 nigel 93 can match the empty string or not. It is called from could_be_empty()
1412     below and from compile_branch() when checking for an unlimited repeat of a
1413     group that can match nothing. Note that first_significant_code() skips over
1414     assertions. If we hit an unclosed bracket, we return "empty" - this means we've
1415     struck an inner bracket whose current branch will already have been scanned.
1416 nigel 77
1417     Arguments:
1418     code points to start of search
1419     endcode points to where to stop
1420     utf8 TRUE if in UTF8 mode
1421    
1422     Returns: TRUE if what is matched could be empty
1423     */
1424    
1425     static BOOL
1426     could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1427     {
1428     register int c;
1429 nigel 93 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1430 nigel 77 code < endcode;
1431     code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1432     {
1433     const uschar *ccode;
1434    
1435     c = *code;
1436 ph10 172
1437 ph10 170 /* Groups with zero repeats can of course be empty; skip them. */
1438 nigel 77
1439 ph10 170 if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1440     {
1441 ph10 172 code += _pcre_OP_lengths[c];
1442 ph10 170 do code += GET(code, 1); while (*code == OP_ALT);
1443     c = *code;
1444     continue;
1445     }
1446    
1447     /* For other groups, scan the branches. */
1448 ph10 172
1449 ph10 206 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1450 nigel 77 {
1451     BOOL empty_branch;
1452     if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1453    
1454     /* Scan a closed bracket */
1455    
1456     empty_branch = FALSE;
1457     do
1458     {
1459     if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1460     empty_branch = TRUE;
1461     code += GET(code, 1);
1462     }
1463     while (*code == OP_ALT);
1464     if (!empty_branch) return FALSE; /* All branches are non-empty */
1465 ph10 172 c = *code;
1466 nigel 93 continue;
1467 nigel 77 }
1468    
1469 nigel 93 /* Handle the other opcodes */
1470    
1471     switch (c)
1472 nigel 77 {
1473     /* Check for quantifiers after a class */
1474    
1475     #ifdef SUPPORT_UTF8
1476     case OP_XCLASS:
1477     ccode = code + GET(code, 1);
1478     goto CHECK_CLASS_REPEAT;
1479     #endif
1480    
1481     case OP_CLASS:
1482     case OP_NCLASS:
1483     ccode = code + 33;
1484    
1485     #ifdef SUPPORT_UTF8
1486     CHECK_CLASS_REPEAT:
1487     #endif
1488    
1489     switch (*ccode)
1490     {
1491     case OP_CRSTAR: /* These could be empty; continue */
1492     case OP_CRMINSTAR:
1493     case OP_CRQUERY:
1494     case OP_CRMINQUERY:
1495     break;
1496    
1497     default: /* Non-repeat => class must match */
1498     case OP_CRPLUS: /* These repeats aren't empty */
1499     case OP_CRMINPLUS:
1500     return FALSE;
1501    
1502     case OP_CRRANGE:
1503     case OP_CRMINRANGE:
1504     if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1505     break;
1506     }
1507     break;
1508    
1509     /* Opcodes that must match a character */
1510    
1511     case OP_PROP:
1512     case OP_NOTPROP:
1513     case OP_EXTUNI:
1514     case OP_NOT_DIGIT:
1515     case OP_DIGIT:
1516     case OP_NOT_WHITESPACE:
1517     case OP_WHITESPACE:
1518     case OP_NOT_WORDCHAR:
1519     case OP_WORDCHAR:
1520     case OP_ANY:
1521     case OP_ANYBYTE:
1522     case OP_CHAR:
1523     case OP_CHARNC:
1524     case OP_NOT:
1525     case OP_PLUS:
1526     case OP_MINPLUS:
1527 nigel 93 case OP_POSPLUS:
1528 nigel 77 case OP_EXACT:
1529     case OP_NOTPLUS:
1530     case OP_NOTMINPLUS:
1531 nigel 93 case OP_NOTPOSPLUS:
1532 nigel 77 case OP_NOTEXACT:
1533     case OP_TYPEPLUS:
1534     case OP_TYPEMINPLUS:
1535 nigel 93 case OP_TYPEPOSPLUS:
1536 nigel 77 case OP_TYPEEXACT:
1537     return FALSE;
1538    
1539     /* End of branch */
1540    
1541     case OP_KET:
1542     case OP_KETRMAX:
1543     case OP_KETRMIN:
1544     case OP_ALT:
1545     return TRUE;
1546    
1547 nigel 93 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1548     MINUPTO, and POSUPTO may be followed by a multibyte character */
1549 nigel 77
1550     #ifdef SUPPORT_UTF8
1551     case OP_STAR:
1552     case OP_MINSTAR:
1553 nigel 93 case OP_POSSTAR:
1554 nigel 77 case OP_QUERY:
1555     case OP_MINQUERY:
1556 nigel 93 case OP_POSQUERY:
1557 nigel 77 case OP_UPTO:
1558     case OP_MINUPTO:
1559 nigel 93 case OP_POSUPTO:
1560 nigel 77 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1561     break;
1562     #endif
1563     }
1564     }
1565    
1566     return TRUE;
1567     }
1568    
1569    
1570    
1571     /*************************************************
1572     * Scan compiled regex for non-emptiness *
1573     *************************************************/
1574    
1575     /* This function is called to check for left recursive calls. We want to check
1576     the current branch of the current pattern to see if it could match the empty
1577     string. If it could, we must look outwards for branches at other levels,
1578     stopping when we pass beyond the bracket which is the subject of the recursion.
1579    
1580     Arguments:
1581     code points to start of the recursion
1582     endcode points to where to stop (current RECURSE item)
1583     bcptr points to the chain of current (unclosed) branch starts
1584     utf8 TRUE if in UTF-8 mode
1585    
1586     Returns: TRUE if what is matched could be empty
1587     */
1588    
1589     static BOOL
1590     could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1591     BOOL utf8)
1592     {
1593     while (bcptr != NULL && bcptr->current >= code)
1594     {
1595     if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1596     bcptr = bcptr->outer;
1597     }
1598     return TRUE;
1599     }
1600    
1601    
1602    
1603     /*************************************************
1604     * Check for POSIX class syntax *
1605     *************************************************/
1606    
1607     /* This function is called when the sequence "[:" or "[." or "[=" is
1608     encountered in a character class. It checks whether this is followed by an
1609     optional ^ and then a sequence of letters, terminated by a matching ":]" or
1610     ".]" or "=]".
1611    
1612     Argument:
1613     ptr pointer to the initial [
1614     endptr where to return the end pointer
1615     cd pointer to compile data
1616    
1617     Returns: TRUE or FALSE
1618     */
1619    
1620     static BOOL
1621     check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1622     {
1623     int terminator; /* Don't combine these lines; the Solaris cc */
1624     terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1625     if (*(++ptr) == '^') ptr++;
1626     while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1627     if (*ptr == terminator && ptr[1] == ']')
1628     {
1629     *endptr = ptr;
1630     return TRUE;
1631     }
1632     return FALSE;
1633     }
1634    
1635    
1636    
1637    
1638     /*************************************************
1639     * Check POSIX class name *
1640     *************************************************/
1641    
1642     /* This function is called to check the name given in a POSIX-style class entry
1643     such as [:alnum:].
1644    
1645     Arguments:
1646     ptr points to the first letter
1647     len the length of the name
1648    
1649     Returns: a value representing the name, or -1 if unknown
1650     */
1651    
1652     static int
1653     check_posix_name(const uschar *ptr, int len)
1654     {
1655     register int yield = 0;
1656     while (posix_name_lengths[yield] != 0)
1657     {
1658     if (len == posix_name_lengths[yield] &&
1659     strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1660     yield++;
1661     }
1662     return -1;
1663     }
1664    
1665    
1666     /*************************************************
1667     * Adjust OP_RECURSE items in repeated group *
1668     *************************************************/
1669    
1670     /* OP_RECURSE items contain an offset from the start of the regex to the group
1671     that is referenced. This means that groups can be replicated for fixed
1672     repetition simply by copying (because the recursion is allowed to refer to
1673     earlier groups that are outside the current group). However, when a group is
1674     optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1675     it, after it has been compiled. This means that any OP_RECURSE items within it
1676     that refer to the group itself or any contained groups have to have their
1677 nigel 93 offsets adjusted. That one of the jobs of this function. Before it is called,
1678     the partially compiled regex must be temporarily terminated with OP_END.
1679 nigel 77
1680 nigel 93 This function has been extended with the possibility of forward references for
1681     recursions and subroutine calls. It must also check the list of such references
1682     for the group we are dealing with. If it finds that one of the recursions in
1683     the current group is on this list, it adjusts the offset in the list, not the
1684     value in the reference (which is a group number).
1685    
1686 nigel 77 Arguments:
1687     group points to the start of the group
1688     adjust the amount by which the group is to be moved
1689     utf8 TRUE in UTF-8 mode
1690     cd contains pointers to tables etc.
1691 nigel 93 save_hwm the hwm forward reference pointer at the start of the group
1692 nigel 77
1693     Returns: nothing
1694     */
1695    
1696     static void
1697 nigel 93 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1698     uschar *save_hwm)
1699 nigel 77 {
1700     uschar *ptr = group;
1701     while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1702     {
1703 nigel 93 int offset;
1704     uschar *hc;
1705    
1706     /* See if this recursion is on the forward reference list. If so, adjust the
1707     reference. */
1708    
1709     for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1710     {
1711     offset = GET(hc, 0);
1712     if (cd->start_code + offset == ptr + 1)
1713     {
1714     PUT(hc, 0, offset + adjust);
1715     break;
1716     }
1717     }
1718    
1719     /* Otherwise, adjust the recursion offset if it's after the start of this
1720     group. */
1721    
1722     if (hc >= cd->hwm)
1723     {
1724     offset = GET(ptr, 1);
1725     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1726     }
1727    
1728 nigel 77 ptr += 1 + LINK_SIZE;
1729     }
1730     }
1731    
1732    
1733    
1734     /*************************************************
1735     * Insert an automatic callout point *
1736     *************************************************/
1737    
1738     /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1739     callout points before each pattern item.
1740    
1741     Arguments:
1742     code current code pointer
1743     ptr current pattern pointer
1744     cd pointers to tables etc
1745    
1746     Returns: new code pointer
1747     */
1748    
1749     static uschar *
1750     auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1751     {
1752     *code++ = OP_CALLOUT;
1753     *code++ = 255;
1754     PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1755     PUT(code, LINK_SIZE, 0); /* Default length */
1756     return code + 2*LINK_SIZE;
1757     }
1758    
1759    
1760    
1761     /*************************************************
1762     * Complete a callout item *
1763     *************************************************/
1764    
1765     /* A callout item contains the length of the next item in the pattern, which
1766     we can't fill in till after we have reached the relevant point. This is used
1767     for both automatic and manual callouts.
1768    
1769     Arguments:
1770     previous_callout points to previous callout item
1771     ptr current pattern pointer
1772     cd pointers to tables etc
1773    
1774     Returns: nothing
1775     */
1776    
1777     static void
1778     complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1779     {
1780     int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1781     PUT(previous_callout, 2 + LINK_SIZE, length);
1782     }
1783    
1784    
1785    
#ifdef SUPPORT_UCP
/*************************************************
*           Get othercase range                  *
*************************************************/

/* Given the bounds of a class range, find the next run of characters within
it whose other-case values are consecutive. Characters for which
_pcre_ucp_othercase() yields NOTACHAR are skipped first; the run then extends
for as long as each successive character's other case is exactly one more than
the previous one. The start value is advanced past the run, so repeated calls
walk through the whole range.

Arguments:
  cptr        points to the first character to examine; updated on success
  d           last character of the range (inclusive)
  ocptr       receives the first other-case value of the run
  odptr       receives the last other-case value of the run

Yield:        TRUE if a run was found; FALSE if no character up to d has
              an other case
*/

static BOOL
get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
  unsigned int *odptr)
{
unsigned int ch = *cptr;
unsigned int first_other = NOTACHAR;
unsigned int expected;

/* Find the first character that has an other case. */

while (ch <= d && (first_other = _pcre_ucp_othercase(ch)) == NOTACHAR) ch++;

if (ch > d) return FALSE;

*ocptr = first_other;

/* Extend the run while the other-case values stay consecutive. */

ch++;
expected = first_other + 1;
while (ch <= d && _pcre_ucp_othercase(ch) == expected)
  {
  ch++;
  expected++;
  }

*odptr = expected - 1;
*cptr = ch;

return TRUE;
}
#endif  /* SUPPORT_UCP */
1831    
1832    
1833 nigel 93
1834 nigel 77 /*************************************************
1835 nigel 93 * Check if auto-possessifying is possible *
1836     *************************************************/
1837    
1838     /* This function is called for unlimited repeats of certain items, to see
1839     whether the next thing could possibly match the repeated item. If not, it makes
1840     sense to automatically possessify the repeated item.
1841    
1842     Arguments:
1843     op_code the repeated op code
1844     this data for this item, depends on the opcode
1845     utf8 TRUE in UTF-8 mode
1846     utf8_char used for utf8 character bytes, NULL if not relevant
1847     ptr next character in pattern
1848     options options bits
1849     cd contains pointers to tables etc.
1850    
1851     Returns: TRUE if possessifying is wanted
1852     */
1853    
1854     static BOOL
1855     check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
1856     const uschar *ptr, int options, compile_data *cd)
1857     {
1858     int next;
1859    
1860     /* Skip whitespace and comments in extended mode */
1861    
1862     if ((options & PCRE_EXTENDED) != 0)
1863     {
1864     for (;;)
1865     {
1866     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1867     if (*ptr == '#')
1868     {
1869     while (*(++ptr) != 0)
1870     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1871     }
1872     else break;
1873     }
1874     }
1875    
1876     /* If the next item is one that we can handle, get its value. A non-negative
1877     value is a character, a negative value is an escape value. */
1878    
1879     if (*ptr == '\\')
1880     {
1881     int temperrorcode = 0;
1882     next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
1883     if (temperrorcode != 0) return FALSE;
1884     ptr++; /* Point after the escape sequence */
1885     }
1886    
1887     else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
1888     {
1889     #ifdef SUPPORT_UTF8
1890     if (utf8) { GETCHARINC(next, ptr); } else
1891     #endif
1892     next = *ptr++;
1893     }
1894    
1895     else return FALSE;
1896    
1897     /* Skip whitespace and comments in extended mode */
1898    
1899     if ((options & PCRE_EXTENDED) != 0)
1900     {
1901     for (;;)
1902     {
1903     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1904     if (*ptr == '#')
1905     {
1906     while (*(++ptr) != 0)
1907     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1908     }
1909     else break;
1910     }
1911     }
1912    
1913     /* If the next thing is itself optional, we have to give up. */
1914    
1915     if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
1916     return FALSE;
1917    
1918     /* Now compare the next item with the previous opcode. If the previous is a
1919     positive single character match, "item" either contains the character or, if
1920     "item" is greater than 127 in utf8 mode, the character's bytes are in
1921     utf8_char. */
1922    
1923    
1924     /* Handle cases when the next item is a character. */
1925    
1926     if (next >= 0) switch(op_code)
1927     {
1928     case OP_CHAR:
1929     #ifdef SUPPORT_UTF8
1930     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1931     #endif
1932     return item != next;
1933    
1934     /* For CHARNC (caseless character) we must check the other case. If we have
1935     Unicode property support, we can use it to test the other case of
1936     high-valued characters. */
1937    
1938     case OP_CHARNC:
1939     #ifdef SUPPORT_UTF8
1940     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1941     #endif
1942     if (item == next) return FALSE;
1943     #ifdef SUPPORT_UTF8
1944     if (utf8)
1945     {
1946     unsigned int othercase;
1947     if (next < 128) othercase = cd->fcc[next]; else
1948     #ifdef SUPPORT_UCP
1949     othercase = _pcre_ucp_othercase((unsigned int)next);
1950     #else
1951     othercase = NOTACHAR;
1952     #endif
1953     return (unsigned int)item != othercase;
1954     }
1955     else
1956     #endif /* SUPPORT_UTF8 */
1957     return (item != cd->fcc[next]); /* Non-UTF-8 mode */
1958    
1959     /* For OP_NOT, "item" must be a single-byte character. */
1960    
1961     case OP_NOT:
1962     if (next < 0) return FALSE; /* Not a character */
1963     if (item == next) return TRUE;
1964     if ((options & PCRE_CASELESS) == 0) return FALSE;
1965     #ifdef SUPPORT_UTF8
1966     if (utf8)
1967     {
1968     unsigned int othercase;
1969     if (next < 128) othercase = cd->fcc[next]; else
1970     #ifdef SUPPORT_UCP
1971     othercase = _pcre_ucp_othercase(next);
1972     #else
1973     othercase = NOTACHAR;
1974     #endif
1975     return (unsigned int)item == othercase;
1976     }
1977     else
1978     #endif /* SUPPORT_UTF8 */
1979     return (item == cd->fcc[next]); /* Non-UTF-8 mode */
1980    
1981     case OP_DIGIT:
1982     return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
1983    
1984     case OP_NOT_DIGIT:
1985     return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
1986    
1987     case OP_WHITESPACE:
1988     return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
1989    
1990     case OP_NOT_WHITESPACE:
1991     return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
1992    
1993     case OP_WORDCHAR:
1994     return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
1995    
1996     case OP_NOT_WORDCHAR:
1997     return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
1998    
1999 ph10 180 case OP_HSPACE:
2000     case OP_NOT_HSPACE:
2001     switch(next)
2002     {
2003     case 0x09:
2004     case 0x20:
2005     case 0xa0:
2006     case 0x1680:
2007     case 0x180e:
2008     case 0x2000:
2009     case 0x2001:
2010     case 0x2002:
2011     case 0x2003:
2012     case 0x2004:
2013     case 0x2005:
2014     case 0x2006:
2015     case 0x2007:
2016     case 0x2008:
2017     case 0x2009:
2018     case 0x200A:
2019     case 0x202f:
2020     case 0x205f:
2021     case 0x3000:
2022     return op_code != OP_HSPACE;
2023     default:
2024     return op_code == OP_HSPACE;
2025     }
2026    
2027     case OP_VSPACE:
2028     case OP_NOT_VSPACE:
2029     switch(next)
2030     {
2031     case 0x0a:
2032     case 0x0b:
2033     case 0x0c:
2034     case 0x0d:
2035     case 0x85:
2036     case 0x2028:
2037     case 0x2029:
2038     return op_code != OP_VSPACE;
2039     default:
2040     return op_code == OP_VSPACE;
2041     }
2042    
2043 nigel 93 default:
2044     return FALSE;
2045     }
2046    
2047    
2048     /* Handle the case when the next item is \d, \s, etc. */
2049    
2050     switch(op_code)
2051     {
2052     case OP_CHAR:
2053     case OP_CHARNC:
2054     #ifdef SUPPORT_UTF8
2055     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2056     #endif
2057     switch(-next)
2058     {
2059     case ESC_d:
2060     return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2061    
2062     case ESC_D:
2063     return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2064    
2065     case ESC_s:
2066     return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2067    
2068     case ESC_S:
2069     return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2070    
2071     case ESC_w:
2072     return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2073    
2074     case ESC_W:
2075     return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2076 ph10 182
2077 ph10 180 case ESC_h:
2078     case ESC_H:
2079     switch(item)
2080     {
2081     case 0x09:
2082     case 0x20:
2083     case 0xa0:
2084     case 0x1680:
2085     case 0x180e:
2086     case 0x2000:
2087     case 0x2001:
2088     case 0x2002:
2089     case 0x2003:
2090     case 0x2004:
2091     case 0x2005:
2092     case 0x2006:
2093     case 0x2007:
2094     case 0x2008:
2095     case 0x2009:
2096     case 0x200A:
2097     case 0x202f:
2098     case 0x205f:
2099     case 0x3000:
2100     return -next != ESC_h;
2101     default:
2102     return -next == ESC_h;
2103 ph10 182 }
2104    
2105 ph10 180 case ESC_v:
2106     case ESC_V:
2107     switch(item)
2108     {
2109     case 0x0a:
2110     case 0x0b:
2111     case 0x0c:
2112     case 0x0d:
2113     case 0x85:
2114     case 0x2028:
2115     case 0x2029:
2116     return -next != ESC_v;
2117     default:
2118     return -next == ESC_v;
2119 ph10 182 }
2120 nigel 93
2121     default:
2122     return FALSE;
2123     }
2124    
2125     case OP_DIGIT:
2126 ph10 180 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2127     next == -ESC_h || next == -ESC_v;
2128 nigel 93
2129     case OP_NOT_DIGIT:
2130     return next == -ESC_d;
2131    
2132     case OP_WHITESPACE:
2133     return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2134    
2135     case OP_NOT_WHITESPACE:
2136 ph10 180 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2137 nigel 93
2138 ph10 180 case OP_HSPACE:
2139     return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2140    
2141     case OP_NOT_HSPACE:
2142     return next == -ESC_h;
2143 ph10 182
2144 ph10 180 /* Can't have \S in here because VT matches \S (Perl anomaly) */
2145 ph10 182 case OP_VSPACE:
2146 ph10 180 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2147    
2148     case OP_NOT_VSPACE:
2149 ph10 182 return next == -ESC_v;
2150 ph10 180
2151 nigel 93 case OP_WORDCHAR:
2152 ph10 180 return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2153 nigel 93
2154     case OP_NOT_WORDCHAR:
2155     return next == -ESC_w || next == -ESC_d;
2156 ph10 182
2157 nigel 93 default:
2158     return FALSE;
2159     }
2160    
2161     /* Control does not reach here */
2162     }
2163    
2164    
2165    
2166     /*************************************************
2167 nigel 77 * Compile one branch *
2168     *************************************************/
2169    
2170 nigel 93 /* Scan the pattern, compiling it into a vector. If the options are
2171 nigel 77 changed during the branch, the pointer is used to change the external options
2172 nigel 93 bits. This function is used during the pre-compile phase when we are trying
2173     to find out the amount of memory needed, as well as during the real compile
2174     phase. The value of lengthptr distinguishes the two phases.
2175 nigel 77
2176     Arguments:
2177     optionsptr pointer to the option bits
2178     codeptr points to the pointer to the current code point
2179     ptrptr points to the current pattern pointer
2180     errorcodeptr points to error code variable
2181     firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2182     reqbyteptr set to the last literal character required, else < 0
2183     bcptr points to current branch chain
2184     cd contains pointers to tables etc.
2185 nigel 93 lengthptr NULL during the real compile phase
2186     points to length accumulator during pre-compile phase
2187 nigel 77
2188     Returns: TRUE on success
2189     FALSE, with *errorcodeptr set non-zero on error
2190     */
2191    
2192     static BOOL
2193 nigel 93 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2194     int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2195     compile_data *cd, int *lengthptr)
2196 nigel 77 {
2197     int repeat_type, op_type;
2198     int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2199     int bravalue = 0;
2200     int greedy_default, greedy_non_default;
2201     int firstbyte, reqbyte;
2202     int zeroreqbyte, zerofirstbyte;
2203     int req_caseopt, reqvary, tempreqvary;
2204     int options = *optionsptr;
2205     int after_manual_callout = 0;
2206 nigel 93 int length_prevgroup = 0;
2207 nigel 77 register int c;
2208     register uschar *code = *codeptr;
2209 nigel 93 uschar *last_code = code;
2210     uschar *orig_code = code;
2211 nigel 77 uschar *tempcode;
2212     BOOL inescq = FALSE;
2213     BOOL groupsetfirstbyte = FALSE;
2214     const uschar *ptr = *ptrptr;
2215     const uschar *tempptr;
2216     uschar *previous = NULL;
2217     uschar *previous_callout = NULL;
2218 nigel 93 uschar *save_hwm = NULL;
2219 nigel 77 uschar classbits[32];
2220    
2221     #ifdef SUPPORT_UTF8
2222     BOOL class_utf8;
2223     BOOL utf8 = (options & PCRE_UTF8) != 0;
2224     uschar *class_utf8data;
2225     uschar utf8_char[6];
2226     #else
2227     BOOL utf8 = FALSE;
2228 nigel 93 uschar *utf8_char = NULL;
2229 nigel 77 #endif
2230    
2231 nigel 93 #ifdef DEBUG
2232     if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2233     #endif
2234    
2235 nigel 77 /* Set up the default and non-default settings for greediness */
2236    
2237     greedy_default = ((options & PCRE_UNGREEDY) != 0);
2238     greedy_non_default = greedy_default ^ 1;
2239    
2240     /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2241     matching encountered yet". It gets changed to REQ_NONE if we hit something that
2242     matches a non-fixed char first char; reqbyte just remains unset if we never
2243     find one.
2244    
2245     When we hit a repeat whose minimum is zero, we may have to adjust these values
2246     to take the zero repeat into account. This is implemented by setting them to
2247     zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2248     item types that can be repeated set these backoff variables appropriately. */
2249    
2250     firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2251    
2252     /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2253     according to the current setting of the caseless flag. REQ_CASELESS is a bit
2254     value > 255. It is added into the firstbyte or reqbyte variables to record the
2255     case status of the value. This is used only for ASCII characters. */
2256    
2257     req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2258    
2259     /* Switch on next character until the end of the branch */
2260    
2261     for (;; ptr++)
2262     {
2263     BOOL negate_class;
2264     BOOL possessive_quantifier;
2265     BOOL is_quantifier;
2266 nigel 93 BOOL is_recurse;
2267 ph10 180 BOOL reset_bracount;
2268 nigel 77 int class_charcount;
2269     int class_lastchar;
2270     int newoptions;
2271     int recno;
2272 ph10 172 int refsign;
2273 nigel 77 int skipbytes;
2274     int subreqbyte;
2275     int subfirstbyte;
2276 nigel 93 int terminator;
2277 nigel 77 int mclength;
2278     uschar mcbuffer[8];
2279    
2280 nigel 93 /* Get next byte in the pattern */
2281 nigel 77
2282     c = *ptr;
2283    
2284 nigel 93 /* If we are in the pre-compile phase, accumulate the length used for the
2285     previous cycle of this loop. */
2286    
2287     if (lengthptr != NULL)
2288     {
2289     #ifdef DEBUG
2290     if (code > cd->hwm) cd->hwm = code; /* High water info */
2291     #endif
2292     if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2293     {
2294     *errorcodeptr = ERR52;
2295     goto FAILED;
2296     }
2297    
2298     /* There is at least one situation where code goes backwards: this is the
2299     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2300     the class is simply eliminated. However, it is created first, so we have to
2301     allow memory for it. Therefore, don't ever reduce the length at this point.
2302     */
2303    
2304     if (code < last_code) code = last_code;
2305 ph10 202
2306     /* Paranoid check for integer overflow */
2307    
2308     if (OFLOW_MAX - *lengthptr < code - last_code)
2309     {
2310     *errorcodeptr = ERR20;
2311     goto FAILED;
2312     }
2313    
2314 nigel 93 *lengthptr += code - last_code;
2315     DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2316    
2317     /* If "previous" is set and it is not at the start of the work space, move
2318     it back to there, in order to avoid filling up the work space. Otherwise,
2319     if "previous" is NULL, reset the current code pointer to the start. */
2320    
2321     if (previous != NULL)
2322     {
2323     if (previous > orig_code)
2324     {
2325     memmove(orig_code, previous, code - previous);
2326     code -= previous - orig_code;
2327     previous = orig_code;
2328     }
2329     }
2330     else code = orig_code;
2331    
2332     /* Remember where this code item starts so we can pick up the length
2333     next time round. */
2334    
2335     last_code = code;
2336     }
2337    
2338     /* In the real compile phase, just check the workspace used by the forward
2339     reference list. */
2340    
2341     else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2342     {
2343     *errorcodeptr = ERR52;
2344     goto FAILED;
2345     }
2346    
2347 nigel 77 /* If in \Q...\E, check for the end; if not, we have a literal */
2348    
2349     if (inescq && c != 0)
2350     {
2351     if (c == '\\' && ptr[1] == 'E')
2352     {
2353     inescq = FALSE;
2354     ptr++;
2355     continue;
2356     }
2357     else
2358     {
2359     if (previous_callout != NULL)
2360     {
2361 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2362     complete_callout(previous_callout, ptr, cd);
2363 nigel 77 previous_callout = NULL;
2364     }
2365     if ((options & PCRE_AUTO_CALLOUT) != 0)
2366     {
2367     previous_callout = code;
2368     code = auto_callout(code, ptr, cd);
2369     }
2370     goto NORMAL_CHAR;
2371     }
2372     }
2373    
2374     /* Fill in length of a previous callout, except when the next thing is
2375     a quantifier. */
2376    
2377     is_quantifier = c == '*' || c == '+' || c == '?' ||
2378     (c == '{' && is_counted_repeat(ptr+1));
2379    
2380     if (!is_quantifier && previous_callout != NULL &&
2381     after_manual_callout-- <= 0)
2382     {
2383 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2384     complete_callout(previous_callout, ptr, cd);
2385 nigel 77 previous_callout = NULL;
2386     }
2387    
2388     /* In extended mode, skip white space and comments */
2389    
2390     if ((options & PCRE_EXTENDED) != 0)
2391     {
2392     if ((cd->ctypes[c] & ctype_space) != 0) continue;
2393     if (c == '#')
2394     {
2395 nigel 93 while (*(++ptr) != 0)
2396 nigel 91 {
2397 nigel 93 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2398 nigel 91 }
2399 nigel 93 if (*ptr != 0) continue;
2400    
2401 nigel 91 /* Else fall through to handle end of string */
2402     c = 0;
2403 nigel 77 }
2404     }
2405    
2406     /* No auto callout for quantifiers. */
2407    
2408     if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2409     {
2410     previous_callout = code;
2411     code = auto_callout(code, ptr, cd);
2412     }
2413    
2414     switch(c)
2415     {
2416 nigel 93 /* ===================================================================*/
2417     case 0: /* The branch terminates at string end */
2418     case '|': /* or | or ) */
2419 nigel 77 case ')':
2420     *firstbyteptr = firstbyte;
2421     *reqbyteptr = reqbyte;
2422     *codeptr = code;
2423     *ptrptr = ptr;
2424 nigel 93 if (lengthptr != NULL)
2425     {
2426 ph10 202 if (OFLOW_MAX - *lengthptr < code - last_code)
2427     {
2428     *errorcodeptr = ERR20;
2429     goto FAILED;
2430     }
2431 nigel 93 *lengthptr += code - last_code; /* To include callout length */
2432     DPRINTF((">> end branch\n"));
2433     }
2434 nigel 77 return TRUE;
2435    
2436 nigel 93
2437     /* ===================================================================*/
2438 nigel 77 /* Handle single-character metacharacters. In multiline mode, ^ disables
2439     the setting of any following char as a first character. */
2440    
2441     case '^':
2442     if ((options & PCRE_MULTILINE) != 0)
2443     {
2444     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2445     }
2446     previous = NULL;
2447     *code++ = OP_CIRC;
2448     break;
2449    
2450     case '$':
2451     previous = NULL;
2452     *code++ = OP_DOLL;
2453     break;
2454    
2455     /* There can never be a first char if '.' is first, whatever happens about
2456     repeats. The value of reqbyte doesn't change either. */
2457    
2458     case '.':
2459     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2460     zerofirstbyte = firstbyte;
2461     zeroreqbyte = reqbyte;
2462     previous = code;
2463     *code++ = OP_ANY;
2464     break;
2465    
2466 nigel 93
2467     /* ===================================================================*/
2468 nigel 87 /* Character classes. If the included characters are all < 256, we build a
2469     32-byte bitmap of the permitted characters, except in the special case
2470     where there is only one such character. For negated classes, we build the
2471     map as usual, then invert it at the end. However, we use a different opcode
2472     so that data characters > 255 can be handled correctly.
2473 nigel 77
2474     If the class contains characters outside the 0-255 range, a different
2475     opcode is compiled. It may optionally have a bit map for characters < 256,
2476     but those above are explicitly listed afterwards. A flag byte tells
2477     whether the bitmap is present, and whether this is a negated class or not.
2478     */
2479    
2480     case '[':
2481     previous = code;
2482    
2483     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2484     they are encountered at the top level, so we'll do that too. */
2485    
2486     if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2487     check_posix_syntax(ptr, &tempptr, cd))
2488     {
2489     *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2490     goto FAILED;
2491     }
2492    
2493 ph10 205 /* If the first character is '^', set the negation flag and skip it. Also,
2494 ph10 208 if the first few characters (either before or after ^) are \Q\E or \E we
2495 ph10 205 skip them too. This makes for compatibility with Perl. */
2496 ph10 208
2497 ph10 205 negate_class = FALSE;
2498     for (;;)
2499 nigel 77 {
2500     c = *(++ptr);
2501 ph10 205 if (c == '\\')
2502     {
2503 ph10 208 if (ptr[1] == 'E') ptr++;
2504 ph10 205 else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
2505 ph10 208 else break;
2506 ph10 205 }
2507     else if (!negate_class && c == '^')
2508     negate_class = TRUE;
2509     else break;
2510 ph10 208 }
2511 nigel 77
2512     /* Keep a count of chars with values < 256 so that we can optimize the case
2513 nigel 93 of just a single character (as long as it's < 256). However, for higher
2514     valued UTF-8 characters, we don't yet do any optimization. */
2515 nigel 77
2516     class_charcount = 0;
2517     class_lastchar = -1;
2518    
2519 nigel 93 /* Initialize the 32-char bit map to all zeros. We build the map in a
2520     temporary bit of memory, in case the class contains only 1 character (less
2521     than 256), because in that case the compiled code doesn't use the bit map.
2522     */
2523    
2524     memset(classbits, 0, 32 * sizeof(uschar));
2525    
2526 nigel 77 #ifdef SUPPORT_UTF8
2527     class_utf8 = FALSE; /* No chars >= 256 */
2528 nigel 93 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
2529 nigel 77 #endif
2530    
2531     /* Process characters until ] is reached. By writing this as a "do" it
2532 nigel 93 means that an initial ] is taken as a data character. At the start of the
2533     loop, c contains the first byte of the character. */
2534 nigel 77
2535 nigel 93 if (c != 0) do
2536 nigel 77 {
2537 nigel 93 const uschar *oldptr;
2538    
2539 nigel 77 #ifdef SUPPORT_UTF8
2540     if (utf8 && c > 127)
2541     { /* Braces are required because the */
2542     GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2543     }
2544     #endif
2545    
2546     /* Inside \Q...\E everything is literal except \E */
2547    
2548     if (inescq)
2549     {
2550 nigel 93 if (c == '\\' && ptr[1] == 'E') /* If we are at \E */
2551 nigel 77 {
2552 nigel 93 inescq = FALSE; /* Reset literal state */
2553     ptr++; /* Skip the 'E' */
2554     continue; /* Carry on with next */
2555 nigel 77 }
2556 nigel 93 goto CHECK_RANGE; /* Could be range if \E follows */
2557 nigel 77 }
2558    
2559     /* Handle POSIX class names. Perl allows a negation extension of the
2560     form [:^name:]. A square bracket that doesn't match the syntax is
2561     treated as a literal. We also recognize the POSIX constructions
2562     [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2563     5.6 and 5.8 do. */
2564    
2565     if (c == '[' &&
2566     (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2567     check_posix_syntax(ptr, &tempptr, cd))
2568     {
2569     BOOL local_negate = FALSE;
2570 nigel 87 int posix_class, taboffset, tabopt;
2571 nigel 77 register const uschar *cbits = cd->cbits;
2572 nigel 87 uschar pbits[32];
2573 nigel 77
2574     if (ptr[1] != ':')
2575     {
2576     *errorcodeptr = ERR31;
2577     goto FAILED;
2578     }
2579    
2580     ptr += 2;
2581     if (*ptr == '^')
2582     {
2583     local_negate = TRUE;
2584     ptr++;
2585     }
2586    
2587     posix_class = check_posix_name(ptr, tempptr - ptr);
2588     if (posix_class < 0)
2589     {
2590     *errorcodeptr = ERR30;
2591     goto FAILED;
2592     }
2593    
2594     /* If matching is caseless, upper and lower are converted to
2595     alpha. This relies on the fact that the class table starts with
2596     alpha, lower, upper as the first 3 entries. */
2597    
2598     if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2599     posix_class = 0;
2600    
2601 nigel 87 /* We build the bit map for the POSIX class in a chunk of local store
2602     because we may be adding and subtracting from it, and we don't want to
2603     subtract bits that may be in the main map already. At the end we or the
2604     result into the bit map that is being built. */
2605 nigel 77
2606     posix_class *= 3;
2607 nigel 87
2608     /* Copy in the first table (always present) */
2609    
2610     memcpy(pbits, cbits + posix_class_maps[posix_class],
2611     32 * sizeof(uschar));
2612    
2613     /* If there is a second table, add or remove it as required. */
2614    
2615     taboffset = posix_class_maps[posix_class + 1];
2616     tabopt = posix_class_maps[posix_class + 2];
2617    
2618     if (taboffset >= 0)
2619 nigel 77 {
2620 nigel 87 if (tabopt >= 0)
2621     for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
2622 nigel 77 else
2623 nigel 87 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
2624 nigel 77 }
2625    
2626 nigel 87 /* Now see if we need to remove any special characters. An option
2627     value of 1 removes vertical space and 2 removes underscore. */
2628    
2629     if (tabopt < 0) tabopt = -tabopt;
2630     if (tabopt == 1) pbits[1] &= ~0x3c;
2631     else if (tabopt == 2) pbits[11] &= 0x7f;
2632    
2633     /* Add the POSIX table or its complement into the main table that is
2634     being built and we are done. */
2635    
2636     if (local_negate)
2637     for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2638     else
2639     for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2640    
2641 nigel 77 ptr = tempptr + 1;
2642     class_charcount = 10; /* Set > 1; assumes more than 1 per class */
2643     continue; /* End of POSIX syntax handling */
2644     }
2645    
2646     /* Backslash may introduce a single character, or it may introduce one
2647 nigel 93 of the specials, which just set a flag. The sequence \b is a special
2648     case. Inside a class (and only there) it is treated as backspace.
2649     Elsewhere it marks a word boundary. Other escapes have preset maps ready
2650 ph10 205 to 'or' into the one we are building. We assume they have more than one
2651 nigel 77 character in them, so set class_charcount bigger than one. */
2652    
2653     if (c == '\\')
2654     {
2655 nigel 93 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2656     if (*errorcodeptr != 0) goto FAILED;
2657 nigel 77
2658     if (-c == ESC_b) c = '\b'; /* \b is backslash in a class */
2659     else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
2660 nigel 93 else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */
2661 nigel 77 else if (-c == ESC_Q) /* Handle start of quoted string */
2662     {
2663     if (ptr[1] == '\\' && ptr[2] == 'E')
2664     {
2665     ptr += 2; /* avoid empty string */
2666     }
2667     else inescq = TRUE;
2668     continue;
2669     }
2670    
2671     if (c < 0)
2672     {
2673     register const uschar *cbits = cd->cbits;
2674     class_charcount += 2; /* Greater than 1 is what matters */
2675 nigel 93
2676     /* Save time by not doing this in the pre-compile phase. */
2677    
2678     if (lengthptr == NULL) switch (-c)
2679 nigel 77 {
2680     case ESC_d:
2681     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2682     continue;
2683    
2684     case ESC_D:
2685     for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2686     continue;
2687    
2688     case ESC_w:
2689     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2690     continue;
2691    
2692     case ESC_W:
2693     for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2694     continue;
2695    
2696     case ESC_s:
2697     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2698     classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2699     continue;
2700    
2701     case ESC_S:
2702     for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2703     classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2704     continue;
2705    
2706 nigel 93 case ESC_E: /* Perl ignores an orphan \E */
2707     continue;
2708 ph10 180
2709 nigel 93 default: /* Not recognized; fall through */
2710     break; /* Need "default" setting to stop compiler warning. */
2711     }
2712    
2713     /* In the pre-compile phase, just do the recognition. */
2714    
2715     else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2716     c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2717 ph10 180
2718 ph10 178 /* We need to deal with \H, \h, \V, and \v in both phases because
2719     they use extra memory. */
2720 ph10 180
2721 ph10 178 if (-c == ESC_h)
2722     {
2723     SETBIT(classbits, 0x09); /* HT */
2724     SETBIT(classbits, 0x20); /* SPACE */
2725 ph10 180 SETBIT(classbits, 0xa0); /* NBSP */
2726 ph10 178 #ifdef SUPPORT_UTF8
2727     if (utf8)
2728 ph10 180 {
2729 ph10 178 class_utf8 = TRUE;
2730     *class_utf8data++ = XCL_SINGLE;
2731 ph10 180 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2732 ph10 178 *class_utf8data++ = XCL_SINGLE;
2733 ph10 180 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2734     *class_utf8data++ = XCL_RANGE;
2735     class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2736     class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2737 ph10 178 *class_utf8data++ = XCL_SINGLE;
2738 ph10 180 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2739 ph10 178 *class_utf8data++ = XCL_SINGLE;
2740 ph10 180 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2741 ph10 178 *class_utf8data++ = XCL_SINGLE;
2742 ph10 180 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
2743     }
2744     #endif
2745     continue;
2746     }
2747 nigel 93
2748 ph10 178 if (-c == ESC_H)
2749     {
2750     for (c = 0; c < 32; c++)
2751     {
2752     int x = 0xff;
2753     switch (c)
2754 ph10 180 {
2755 ph10 178 case 0x09/8: x ^= 1 << (0x09%8); break;
2756     case 0x20/8: x ^= 1 << (0x20%8); break;
2757     case 0xa0/8: x ^= 1 << (0xa0%8); break;
2758     default: break;
2759     }
2760     classbits[c] |= x;
2761 ph10 180 }
2762    
2763 ph10 178 #ifdef SUPPORT_UTF8
2764     if (utf8)
2765 ph10 180 {
2766 ph10 178 class_utf8 = TRUE;
2767 ph10 180 *class_utf8data++ = XCL_RANGE;
2768     class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2769     class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
2770     *class_utf8data++ = XCL_RANGE;
2771     class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
2772     class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
2773     *class_utf8data++ = XCL_RANGE;
2774     class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
2775     class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
2776     *class_utf8data++ = XCL_RANGE;
2777     class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
2778     class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
2779     *class_utf8data++ = XCL_RANGE;
2780     class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
2781     class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
2782     *class_utf8data++ = XCL_RANGE;
2783     class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
2784     class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
2785     *class_utf8data++ = XCL_RANGE;
2786     class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
2787     class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2788     }
2789     #endif
2790     continue;
2791     }
2792 ph10 178
2793     if (-c == ESC_v)
2794     {
2795     SETBIT(classbits, 0x0a); /* LF */
2796     SETBIT(classbits, 0x0b); /* VT */
2797 ph10 180 SETBIT(classbits, 0x0c); /* FF */
2798     SETBIT(classbits, 0x0d); /* CR */
2799     SETBIT(classbits, 0x85); /* NEL */
2800 ph10 178 #ifdef SUPPORT_UTF8
2801     if (utf8)
2802 ph10 180 {
2803 ph10 178 class_utf8 = TRUE;
2804 ph10 180 *class_utf8data++ = XCL_RANGE;
2805     class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
2806     class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2807     }
2808     #endif
2809     continue;
2810     }
2811 ph10 178
2812     if (-c == ESC_V)
2813     {
2814     for (c = 0; c < 32; c++)
2815     {
2816     int x = 0xff;
2817     switch (c)
2818 ph10 180 {
2819 ph10 178 case 0x0a/8: x ^= 1 << (0x0a%8);
2820     x ^= 1 << (0x0b%8);
2821     x ^= 1 << (0x0c%8);
2822 ph10 180 x ^= 1 << (0x0d%8);
2823 ph10 178 break;
2824     case 0x85/8: x ^= 1 << (0x85%8); break;
2825     default: break;
2826     }
2827     classbits[c] |= x;
2828 ph10 180 }
2829    
2830 ph10 178 #ifdef SUPPORT_UTF8
2831     if (utf8)
2832 ph10 180 {
2833 ph10 178 class_utf8 = TRUE;
2834 ph10 180 *class_utf8data++ = XCL_RANGE;
2835     class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2836     class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
2837     *class_utf8data++ = XCL_RANGE;
2838     class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2839     class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2840     }
2841     #endif
2842     continue;
2843     }
2844 ph10 178
2845 nigel 93 /* We need to deal with \P and \p in both phases. */
2846    
2847 nigel 77 #ifdef SUPPORT_UCP
2848 nigel 93 if (-c == ESC_p || -c == ESC_P)
2849     {
2850     BOOL negated;
2851     int pdata;
2852     int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2853     if (ptype < 0) goto FAILED;
2854     class_utf8 = TRUE;
2855     *class_utf8data++ = ((-c == ESC_p) != negated)?
2856     XCL_PROP : XCL_NOTPROP;
2857     *class_utf8data++ = ptype;
2858     *class_utf8data++ = pdata;
2859     class_charcount -= 2; /* Not a < 256 character */
2860 nigel 77 continue;
2861 nigel 93 }
2862 nigel 77 #endif
2863 nigel 93 /* Unrecognized escapes are faulted if PCRE is running in its
2864     strict mode. By default, for compatibility with Perl, they are
2865     treated as literals. */
2866 nigel 77
2867 nigel 93 if ((options & PCRE_EXTRA) != 0)
2868     {
2869     *errorcodeptr = ERR7;
2870     goto FAILED;
2871     }
2872 nigel 77
2873 nigel 93 class_charcount -= 2; /* Undo the default count from above */
2874     c = *ptr; /* Get the final character and fall through */
2875 nigel 77 }
2876    
2877     /* Fall through if we have a single character (c >= 0). This may be
2878 nigel 93 greater than 256 in UTF-8 mode. */
2879 nigel 77
2880     } /* End of backslash handling */
2881    
2882     /* A single character may be followed by '-' to form a range. However,
2883     Perl does not permit ']' to be the end of the range. A '-' character
2884 nigel 93 at the end is treated as a literal. Perl ignores orphaned \E sequences
2885     entirely. The code for handling \Q and \E is messy. */
2886 nigel 77
2887 nigel 93 CHECK_RANGE:
2888     while (ptr[1] == '\\' && ptr[2] == 'E')
2889 nigel 77 {
2890 nigel 93 inescq = FALSE;
2891     ptr += 2;
2892     }
2893    
2894     oldptr = ptr;
2895    
2896     if (!inescq && ptr[1] == '-')
2897     {
2898 nigel 77 int d;
2899     ptr += 2;
2900 nigel 93 while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
2901 nigel 77
2902 nigel 93 /* If we hit \Q (not followed by \E) at this point, go into escaped
2903     mode. */
2904    
2905     while (*ptr == '\\' && ptr[1] == 'Q')
2906     {
2907     ptr += 2;
2908     if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
2909     inescq = TRUE;
2910     break;
2911     }
2912    
2913     if (*ptr == 0 || (!inescq && *ptr == ']'))
2914     {
2915     ptr = oldptr;
2916     goto LONE_SINGLE_CHARACTER;
2917     }
2918    
2919 nigel 77 #ifdef SUPPORT_UTF8
2920     if (utf8)
2921     { /* Braces are required because the */
2922     GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
2923     }
2924     else
2925     #endif
2926     d = *ptr; /* Not UTF-8 mode */
2927    
2928     /* The second part of a range can be a single-character escape, but
2929     not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2930     in such circumstances. */
2931    
2932 nigel 93 if (!inescq && d == '\\')
2933 nigel 77 {
2934 nigel 93 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2935     if (*errorcodeptr != 0) goto FAILED;
2936 nigel 77
2937 nigel 93 /* \b is backspace; \X is literal X; \R is literal R; any other
2938     special means the '-' was literal */
2939 nigel 77
2940     if (d < 0)
2941     {
2942     if (d == -ESC_b) d = '\b';
2943 nigel 93 else if (d == -ESC_X) d = 'X';
2944     else if (d == -ESC_R) d = 'R'; else
2945 nigel 77 {
2946 nigel 93 ptr = oldptr;
2947 nigel 77 goto LONE_SINGLE_CHARACTER; /* A few lines below */
2948     }
2949     }
2950     }
2951    
2952 nigel 93 /* Check that the two values are in the correct order. Optimize
2953     one-character ranges */
2954 nigel 77
2955 nigel 93 if (d < c)
2956     {
2957     *errorcodeptr = ERR8;
2958     goto FAILED;
2959     }
2960    
2961 nigel 77 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
2962    
2963     /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
2964     matching, we have to use an XCLASS with extra data items. Caseless
2965     matching for characters > 127 is available only if UCP support is
2966     available. */
2967    
2968     #ifdef SUPPORT_UTF8
2969     if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
2970     {
2971     class_utf8 = TRUE;
2972    
2973     /* With UCP support, we can find the other case equivalents of
2974     the relevant characters. There may be several ranges. Optimize how
2975     they fit with the basic range. */
2976    
2977     #ifdef SUPPORT_UCP
2978     if ((options & PCRE_CASELESS) != 0)
2979     {
2980 nigel 93 unsigned int occ, ocd;
2981     unsigned int cc = c;
2982     unsigned int origd = d;
2983 nigel 77 while (get_othercase_range(&cc, origd, &occ, &ocd))
2984     {
2985 ph10 180 if (occ >= (unsigned int)c &&
2986     ocd <= (unsigned int)d)
2987 ph10 176 continue; /* Skip embedded ranges */
2988 nigel 77
2989 ph10 180 if (occ < (unsigned int)c &&
2990 ph10 176 ocd >= (unsigned int)c - 1) /* Extend the basic range */
2991 nigel 77 { /* if there is overlap, */
2992     c = occ; /* noting that if occ < c */
2993     continue; /* we can't have ocd > d */
2994     } /* because a subrange is */
2995 ph10 180 if (ocd > (unsigned int)d &&
2996 ph10 176 occ <= (unsigned int)d + 1) /* always shorter than */
2997 nigel 77 { /* the basic range. */
2998     d = ocd;
2999     continue;
3000     }
3001    
3002     if (occ == ocd)
3003     {
3004     *class_utf8data++ = XCL_SINGLE;
3005     }
3006     else
3007     {
3008     *class_utf8data++ = XCL_RANGE;
3009     class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3010     }
3011     class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3012     }
3013     }
3014     #endif /* SUPPORT_UCP */
3015    
3016     /* Now record the original range, possibly modified for UCP caseless
3017     overlapping ranges. */
3018    
3019     *class_utf8data++ = XCL_RANGE;
3020     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3021     class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3022    
3023     /* With UCP support, we are done. Without UCP support, there is no
3024     caseless matching for UTF-8 characters > 127; we can use the bit map
3025     for the smaller ones. */
3026    
3027     #ifdef SUPPORT_UCP
3028     continue; /* With next character in the class */
3029     #else
3030     if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3031    
3032     /* Adjust upper limit and fall through to set up the map */
3033    
3034     d = 127;
3035    
3036     #endif /* SUPPORT_UCP */
3037     }
3038     #endif /* SUPPORT_UTF8 */
3039    
3040     /* We use the bit map for all cases when not in UTF-8 mode; else
3041     ranges that lie entirely within 0-127 when there is UCP support; else
3042     for partial ranges without UCP support. */
3043    
3044 nigel 93 class_charcount += d - c + 1;
3045     class_lastchar = d;
3046    
3047     /* We can save a bit of time by skipping this in the pre-compile. */
3048    
3049     if (lengthptr == NULL) for (; c <= d; c++)
3050 nigel 77 {
3051     classbits[c/8] |= (1 << (c&7));
3052     if ((options & PCRE_CASELESS) != 0)
3053     {
3054     int uc = cd->fcc[c]; /* flip case */
3055     classbits[uc/8] |= (1 << (uc&7));
3056     }
3057     }
3058    
3059     continue; /* Go get the next char in the class */
3060     }
3061    
3062     /* Handle a lone single character - we can get here for a normal
3063     non-escape char, or after \ that introduces a single character or for an
3064     apparent range that isn't. */
3065    
3066     LONE_SINGLE_CHARACTER:
3067    
3068     /* Handle a character that cannot go in the bit map */
3069    
3070     #ifdef SUPPORT_UTF8
3071     if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3072     {
3073     class_utf8 = TRUE;
3074     *class_utf8data++ = XCL_SINGLE;
3075     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3076    
3077     #ifdef SUPPORT_UCP
3078     if ((options & PCRE_CASELESS) != 0)
3079     {
3080 nigel 93 unsigned int othercase;
3081     if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
3082 nigel 77 {
3083     *class_utf8data++ = XCL_SINGLE;
3084     class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3085     }
3086     }
3087     #endif /* SUPPORT_UCP */
3088    
3089     }
3090     else
3091     #endif /* SUPPORT_UTF8 */
3092    
3093     /* Handle a single-byte character */
3094     {
3095     classbits[c/8] |= (1 << (c&7));
3096     if ((options & PCRE_CASELESS) != 0)
3097     {
3098     c = cd->fcc[c]; /* flip case */
3099     classbits[c/8] |= (1 << (c&7));
3100     }
3101     class_charcount++;
3102     class_lastchar = c;
3103     }
3104     }
3105    
3106 nigel 93 /* Loop until ']' reached. This "while" is the end of the "do" above. */
3107 nigel 77
3108 nigel 93 while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
3109 nigel 77
3110 nigel 93 if (c == 0) /* Missing terminating ']' */
3111     {
3112     *errorcodeptr = ERR6;
3113     goto FAILED;
3114     }
3115 ph10 208
3116 nigel 77 /* If class_charcount is 1, we saw precisely one character whose value is
3117     less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
3118     can optimize the negative case only if there were no characters >= 128
3119     because OP_NOT and the related opcodes like OP_NOTSTAR operate on
3120     single-bytes only. This is an historical hangover. Maybe one day we can
3121     tidy these opcodes to handle multi-byte characters.
3122    
3123     The optimization throws away the bit map. We turn the item into a
3124     1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3125     that OP_NOT does not support multibyte characters. In the positive case, it
3126     can cause firstbyte to be set. Otherwise, there can be no first char if
3127     this item is first, whatever repeat count may follow. In the case of
3128     reqbyte, save the previous value for reinstating. */
3129    
3130     #ifdef SUPPORT_UTF8
3131     if (class_charcount == 1 &&
3132     (!utf8 ||
3133     (!class_utf8 && (!negate_class || class_lastchar < 128))))
3134    
3135     #else
3136     if (class_charcount == 1)
3137     #endif
3138     {
3139     zeroreqbyte = reqbyte;
3140    
3141     /* The OP_NOT opcode works on one-byte characters only. */
3142    
3143     if (negate_class)
3144     {
3145     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3146     zerofirstbyte = firstbyte;
3147     *code++ = OP_NOT;
3148     *code++ = class_lastchar;
3149     break;
3150     }
3151    
3152     /* For a single, positive character, get the value into mcbuffer, and
3153     then we can handle this with the normal one-character code. */
3154    
3155     #ifdef SUPPORT_UTF8
3156     if (utf8 && class_lastchar > 127)
3157     mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3158     else
3159     #endif
3160     {
3161     mcbuffer[0] = class_lastchar;
3162     mclength = 1;
3163     }
3164     goto ONE_CHAR;
3165     } /* End of 1-char optimization */
3166    
3167     /* The general case - not the one-char optimization. If this is the first
3168     thing in the branch, there can be no first char setting, whatever the
3169     repeat count. Any reqbyte setting must remain unchanged after any kind of
3170     repeat. */
3171    
3172     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3173     zerofirstbyte = firstbyte;
3174     zeroreqbyte = reqbyte;
3175    
3176     /* If there are characters with values > 255, we have to compile an
3177     extended class, with its own opcode. If there are no characters < 256,
3178 nigel 93 we can omit the bitmap in the actual compiled code. */
3179 nigel 77
3180     #ifdef SUPPORT_UTF8
3181     if (class_utf8)
3182     {
3183     *class_utf8data++ = XCL_END; /* Marks the end of extra data */
3184     *code++ = OP_XCLASS;
3185     code += LINK_SIZE;
3186     *code = negate_class? XCL_NOT : 0;
3187    
3188 nigel 93 /* If the map is required, move up the extra data to make room for it;
3189     otherwise just move the code pointer to the end of the extra data. */
3190 nigel 77
3191     if (class_charcount > 0)
3192     {
3193     *code++ |= XCL_MAP;
3194 nigel 93 memmove(code + 32, code, class_utf8data - code);
3195 nigel 77 memcpy(code, classbits, 32);
3196 nigel 93 code = class_utf8data + 32;
3197 nigel 77 }
3198 nigel 93 else code = class_utf8data;
3199 nigel 77
3200     /* Now fill in the complete length of the item */
3201    
3202     PUT(previous, 1, code - previous);
3203     break; /* End of class handling */
3204     }
3205     #endif
3206    
3207     /* If there are no characters > 255, negate the 32-byte map if necessary,
3208     and copy it into the code vector. If this is the first thing in the branch,
3209     there can be no first char setting, whatever the repeat count. Any reqbyte
3210     setting must remain unchanged after any kind of repeat. */
3211    
3212     if (negate_class)
3213     {
3214     *code++ = OP_NCLASS;
3215 nigel 93 if (lengthptr == NULL) /* Save time in the pre-compile phase */
3216     for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3217 nigel 77 }
3218     else
3219     {
3220     *code++ = OP_CLASS;
3221     memcpy(code, classbits, 32);
3222     }
3223     code += 32;
3224     break;
3225    
3226 nigel 93
3227     /* ===================================================================*/
3228 nigel 77 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3229     has been tested above. */
3230    
3231     case '{':
3232     if (!is_quantifier) goto NORMAL_CHAR;
3233     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3234     if (*errorcodeptr != 0) goto FAILED;
3235     goto REPEAT;
3236    
3237     case '*':
3238     repeat_min = 0;
3239     repeat_max = -1;
3240     goto REPEAT;
3241    
3242     case '+':
3243     repeat_min = 1;
3244     repeat_max = -1;
3245     goto REPEAT;
3246    
3247     case '?':
3248     repeat_min = 0;
3249     repeat_max = 1;
3250    
3251     REPEAT:
3252     if (previous == NULL)
3253     {
3254     *errorcodeptr = ERR9;
3255     goto FAILED;
3256     }
3257    
3258     if (repeat_min == 0)
3259     {
3260     firstbyte = zerofirstbyte; /* Adjust for zero repeat */
3261     reqbyte = zeroreqbyte; /* Ditto */
3262     }
3263    
3264     /* Remember whether this is a variable length repeat */
3265    
3266     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3267    
3268     op_type = 0; /* Default single-char op codes */
3269     possessive_quantifier = FALSE; /* Default not possessive quantifier */
3270    
3271     /* Save start of previous item, in case we have to move it up to make space
3272     for an inserted OP_ONCE for the additional '+' extension. */
3273    
3274     tempcode = previous;
3275    
3276     /* If the next character is '+', we have a possessive quantifier. This
3277     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3278     If the next character is '?' this is a minimizing repeat, by default,
3279     but if PCRE_UNGREEDY is set, it works the other way round. We change the
3280     repeat type to the non-default. */
3281    
3282     if (ptr[1] == '+')
3283     {
3284     repeat_type = 0; /* Force greedy */
3285     possessive_quantifier = TRUE;
3286     ptr++;
3287     }
3288     else if (ptr[1] == '?')
3289     {
3290     repeat_type = greedy_non_default;
3291     ptr++;
3292     }
3293     else repeat_type = greedy_default;
3294    
3295     /* If previous was a character match, abolish the item and generate a
3296     repeat item instead. If a char item has a minimum of more than one, ensure
3297     that it is set in reqbyte - it might not be if a sequence such as x{3} is
3298     the first thing in a branch because the x will have gone into firstbyte
3299     instead. */
3300    
3301     if (*previous == OP_CHAR || *previous == OP_CHARNC)
3302     {
3303     /* Deal with UTF-8 characters that take up more than one byte. It's
3304     easier to write this out separately than try to macrify it. Use c to
3305     hold the length of the character in bytes, plus 0x80 to flag that it's a
3306     length rather than a small character. */
3307    
3308     #ifdef SUPPORT_UTF8
3309     if (utf8 && (code[-1] & 0x80) != 0)
3310     {
3311     uschar *lastchar = code - 1;
3312     while((*lastchar & 0xc0) == 0x80) lastchar--;
3313     c = code - lastchar; /* Length of UTF-8 character */
3314     memcpy(utf8_char, lastchar, c); /* Save the char */
3315     c |= 0x80; /* Flag c as a length */
3316     }
3317     else
3318     #endif
3319    
3320     /* Handle the case of a single byte - either with no UTF8 support, or
3321     with UTF-8 disabled, or for a UTF-8 character < 128. */
3322    
3323     {
3324     c = code[-1];
3325     if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3326     }
3327    
3328 nigel 93 /* If the repetition is unlimited, it pays to see if the next thing on
3329     the line is something that cannot possibly match this character. If so,
3330     automatically possessifying this item gains some performance in the case
3331     where the match fails. */
3332    
3333     if (!possessive_quantifier &&
3334     repeat_max < 0 &&
3335     check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3336     options, cd))
3337     {
3338     repeat_type = 0; /* Force greedy */
3339     possessive_quantifier = TRUE;
3340     }
3341    
3342 nigel 77 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3343     }
3344    
3345     /* If previous was a single negated character ([^a] or similar), we use
3346     one of the special opcodes, replacing it. The code is shared with single-
3347     character repeats by setting op_type to add a suitable offset into
3348 nigel 93 repeat_type. We can also test for auto-possessification. OP_NOT is
3349     currently used only for single-byte chars. */
3350 nigel 77
3351     else if (*previous == OP_NOT)
3352     {
3353     op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3354     c = previous[1];
3355 nigel 93 if (!possessive_quantifier &&
3356     repeat_max < 0 &&
3357     check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3358     {
3359     repeat_type = 0; /* Force greedy */
3360     possessive_quantifier = TRUE;
3361     }
3362 nigel 77 goto OUTPUT_SINGLE_REPEAT;
3363     }
3364    
3365     /* If previous was a character type match (\d or similar), abolish it and
3366     create a suitable repeat item. The code is shared with single-character
3367     repeats by setting op_type to add a suitable offset into repeat_type. Note
3368     that the Unicode property types will be present only when SUPPORT_UCP is
3369     defined, but we don't wrap the little bits of code here because it just
3370     makes it horribly messy. */
3371    
3372     else if (*previous < OP_EODN)
3373     {
3374     uschar *oldcode;
3375 nigel 87 int prop_type, prop_value;
3376 nigel 77 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3377     c = *previous;
3378    
3379 nigel 93 if (!possessive_quantifier &&
3380     repeat_max < 0 &&
3381     check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3382     {
3383     repeat_type = 0; /* Force greedy */
3384     possessive_quantifier = TRUE;
3385     }
3386    
3387 nigel 77 OUTPUT_SINGLE_REPEAT:
3388 nigel 87 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3389     {
3390     prop_type = previous[1];
3391     prop_value = previous[2];
3392     }
3393     else prop_type = prop_value = -1;
3394 nigel 77
3395     oldcode = code;
3396     code = previous; /* Usually overwrite previous item */
3397    
3398     /* If the maximum is zero then the minimum must also be zero; Perl allows
3399     this case, so we do too - by simply omitting the item altogether. */
3400    
3401     if (repeat_max == 0) goto END_REPEAT;
3402    
3403     /* All real repeats make it impossible to handle partial matching (maybe
3404     one day we will be able to remove this restriction). */
3405    
3406     if (repeat_max != 1) cd->nopartial = TRUE;
3407    
3408     /* Combine the op_type with the repeat_type */
3409    
3410     repeat_type += op_type;
3411    
3412     /* A minimum of zero is handled either as the special case * or ?, or as
3413     an UPTO, with the maximum given. */
3414    
3415     if (repeat_min == 0)
3416     {
3417     if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3418     else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3419     else
3420     {
3421     *code++ = OP_UPTO + repeat_type;
3422     PUT2INC(code, 0, repeat_max);
3423     }
3424     }
3425    
3426     /* A repeat minimum of 1 is optimized into some special cases. If the
3427 nigel 93 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3428 nigel 77 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3429     one less than the maximum. */
3430    
3431     else if (repeat_min == 1)
3432     {
3433     if (repeat_max == -1)
3434     *code++ = OP_PLUS + repeat_type;
3435     else
3436     {
3437     code = oldcode; /* leave previous item in place */
3438     if (repeat_max == 1) goto END_REPEAT;
3439     *code++ = OP_UPTO + repeat_type;
3440     PUT2INC(code, 0, repeat_max - 1);
3441     }
3442     }
3443    
3444     /* The case {n,n} is just an EXACT, while the general case {n,m} is
3445     handled as an EXACT followed by an UPTO. */
3446    
3447     else
3448     {
3449     *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3450     PUT2INC(code, 0, repeat_min);
3451    
3452     /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3453     we have to insert the character for the previous code. For a repeated
3454 nigel 87 Unicode property match, there are two extra bytes that define the
3455 nigel 77 required property. In UTF-8 mode, long characters have their length in
3456     c, with the 0x80 bit as a flag. */
3457    
3458     if (repeat_max < 0)
3459     {
3460     #ifdef SUPPORT_UTF8
3461     if (utf8 && c >= 128)
3462     {
3463     memcpy(code, utf8_char, c & 7);
3464     code += c & 7;
3465     }
3466     else
3467     #endif
3468     {
3469     *code++ = c;
3470 nigel 87 if (prop_type >= 0)
3471     {
3472     *code++ = prop_type;
3473     *code++ = prop_value;
3474     }
3475 nigel 77 }
3476     *code++ = OP_STAR + repeat_type;
3477     }
3478    
3479     /* Else insert an UPTO if the max is greater than the min, again
3480 nigel 93 preceded by the character, for the previously inserted code. If the
3481     UPTO is just for 1 instance, we can use QUERY instead. */
3482 nigel 77
3483     else if (repeat_max != repeat_min)
3484     {
3485     #ifdef SUPPORT_UTF8
3486     if (utf8 && c >= 128)
3487     {
3488     memcpy(code, utf8_char, c & 7);
3489     code += c & 7;
3490     }
3491     else
3492     #endif
3493     *code++ = c;
3494 nigel 87 if (prop_type >= 0)
3495     {
3496     *code++ = prop_type;
3497     *code++ = prop_value;
3498     }
3499 nigel 77 repeat_max -= repeat_min;
3500 nigel 93
3501     if (repeat_max == 1)
3502     {
3503     *code++ = OP_QUERY + repeat_type;
3504     }
3505     else
3506     {
3507     *code++ = OP_UPTO + repeat_type;
3508     PUT2INC(code, 0, repeat_max);
3509     }
3510 nigel 77 }
3511     }
3512    
3513     /* The character or character type itself comes last in all cases. */
3514    
3515     #ifdef SUPPORT_UTF8
3516     if (utf8 && c >= 128)
3517     {
3518     memcpy(code, utf8_char, c & 7);
3519     code += c & 7;
3520     }
3521     else
3522     #endif
3523     *code++ = c;
3524    
3525 nigel 87 /* For a repeated Unicode property match, there are two extra bytes that
3526     define the required property. */
3527 nigel 77
3528     #ifdef SUPPORT_UCP
3529 nigel 87 if (prop_type >= 0)
3530     {
3531     *code++ = prop_type;
3532     *code++ = prop_value;
3533     }
3534 nigel 77 #endif
3535     }
3536    
3537     /* If previous was a character class or a back reference, we put the repeat
3538     stuff after it, but just skip the item if the repeat was {0,0}. */
3539    
3540     else if (*previous == OP_CLASS ||
3541     *previous == OP_NCLASS ||
3542     #ifdef SUPPORT_UTF8
3543     *previous == OP_XCLASS ||
3544     #endif
3545     *previous == OP_REF)
3546     {
3547     if (repeat_max == 0)
3548     {
3549     code = previous;
3550     goto END_REPEAT;
3551     }
3552    
3553     /* All real repeats make it impossible to handle partial matching (maybe
3554     one day we will be able to remove this restriction). */
3555    
3556     if (repeat_max != 1) cd->nopartial = TRUE;
3557    
3558     if (repeat_min == 0 && repeat_max == -1)
3559     *code++ = OP_CRSTAR + repeat_type;
3560     else if (repeat_min == 1 && repeat_max == -1)
3561     *code++ = OP_CRPLUS + repeat_type;
3562     else if (repeat_min == 0 && repeat_max == 1)
3563     *code++ = OP_CRQUERY + repeat_type;
3564     else
3565     {
3566     *code++ = OP_CRRANGE + repeat_type;
3567     PUT2INC(code, 0, repeat_min);
3568     if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
3569     PUT2INC(code, 0, repeat_max);
3570     }
3571     }
3572    
3573     /* If previous was a bracket group, we may have to replicate it in certain
3574     cases. */
3575    
3576 nigel 93 else if (*previous == OP_BRA || *previous == OP_CBRA ||
3577     *previous == OP_ONCE || *previous == OP_COND)
3578 nigel 77 {
3579     register int i;
3580     int ketoffset = 0;
3581     int len = code - previous;
3582     uschar *bralink = NULL;
3583    
3584 nigel 93 /* Repeating a DEFINE group is pointless */
3585    
3586     if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3587     {
3588     *errorcodeptr = ERR55;
3589     goto FAILED;
3590     }
3591    
3592 nigel 77 /* If the maximum repeat count is unlimited, find the end of the bracket
3593     by scanning through from the start, and compute the offset back to it
3594     from the current code pointer. There may be an OP_OPT setting following
3595     the final KET, so we can't find the end just by going back from the code
3596     pointer. */
3597    
3598     if (repeat_max == -1)
3599     {
3600     register uschar *ket = previous;
3601     do ket += GET(ket, 1); while (*ket != OP_KET);
3602     ketoffset = code - ket;
3603     }
3604    
3605     /* The case of a zero minimum is special because of the need to stick
3606     OP_BRAZERO in front of it, and because the group appears once in the
3607     data, whereas in other cases it appears the minimum number of times. For
3608     this reason, it is simplest to treat this case separately, as otherwise
3609     the code gets far too messy. There are several special subcases when the
3610     minimum is zero. */
3611    
3612     if (repeat_min == 0)
3613     {
3614     /* If the maximum is also zero, we just omit the group from the output
3615     altogether. */
3616    
3617     if (repeat_max == 0)
3618     {
3619     code = previous;
3620     goto END_REPEAT;
3621     }
3622    
3623     /* If the maximum is 1 or unlimited, we just have to stick in the
3624     BRAZERO and do no more at this point. However, we do need to adjust
3625     any OP_RECURSE calls inside the group that refer to the group itself or
3626 nigel 93 any internal or forward referenced group, because the offset is from
3627     the start of the whole regex. Temporarily terminate the pattern while
3628     doing this. */
3629 nigel 77
3630     if (repeat_max <= 1)
3631     {
3632     *code = OP_END;
3633 nigel 93 adjust_recurse(previous, 1, utf8, cd, save_hwm);
3634 nigel 77 memmove(previous+1, previous, len);
3635     code++;
3636     *previous++ = OP_BRAZERO + repeat_type;
3637     }
3638    
3639     /* If the maximum is greater than 1 and limited, we have to replicate
3640     in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3641     The first one has to be handled carefully because it's the original
3642     copy, which has to be moved up. The remainder can be handled by code
3643     that is common with the non-zero minimum case below. We have to
3644     adjust the value of repeat_max, since one less copy is required. Once
3645     again, we may have to adjust any OP_RECURSE calls inside the group. */
3646    
3647     else
3648     {
3649     int offset;
3650     *code = OP_END;
3651 nigel 93 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3652 nigel 77 memmove(previous + 2 + LINK_SIZE, previous, len);
3653     code += 2 + LINK_SIZE;
3654     *previous++ = OP_BRAZERO + repeat_type;
3655     *previous++ = OP_BRA;
3656    
3657     /* We chain together the bracket offset fields that have to be
3658     filled in later when the ends of the brackets are reached. */
3659    
3660     offset = (bralink == NULL)? 0 : previous - bralink;
3661     bralink = previous;
3662     PUTINC(previous, 0, offset);
3663     }
3664    
3665     repeat_max--;
3666     }
3667    
3668     /* If the minimum is greater than zero, replicate the group as many
3669     times as necessary, and adjust the maximum to the number of subsequent
3670     copies that we need. If we set a first char from the group, and didn't
3671 nigel 93 set a required char, copy the latter from the former. If there are any
3672     forward reference subroutine calls in the group, there will be entries on
3673     the workspace list; replicate these with an appropriate increment. */
3674 nigel 77
3675     else
3676     {
3677     if (repeat_min > 1)
3678     {
3679 nigel 93 /* In the pre-compile phase, we don't actually do the replication. We
3680 ph10 202 just adjust the length as if we had. Do some paranoid checks for
3681     potential integer overflow. */
3682 nigel 93
3683     if (lengthptr != NULL)
3684 ph10 202 {
3685     int delta = (repeat_min - 1)*length_prevgroup;
3686     if ((double)(repeat_min - 1)*(double)length_prevgroup >
3687     (double)INT_MAX ||
3688     OFLOW_MAX - *lengthptr < delta)
3689     {
3690     *errorcodeptr = ERR20;
3691     goto FAILED;
3692     }
3693     *lengthptr += delta;
3694     }
3695 nigel 93
3696     /* This is compiling for real */
3697    
3698     else
3699 nigel 77 {
3700 nigel 93 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3701     for (i = 1; i < repeat_min; i++)
3702     {
3703     uschar *hc;
3704     uschar *this_hwm = cd->hwm;
3705     memcpy(code, previous, len);
3706     for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3707     {
3708     PUT(cd->hwm, 0, GET(hc, 0) + len);
3709     cd->hwm += LINK_SIZE;
3710     }
3711     save_hwm = this_hwm;
3712     code += len;
3713     }
3714 nigel 77 }
3715     }
3716 nigel 93
3717 nigel 77 if (repeat_max > 0) repeat_max -= repeat_min;
3718     }
3719    
3720     /* This code is common to both the zero and non-zero minimum cases. If
3721     the maximum is limited, it replicates the group in a nested fashion,
3722     remembering the bracket starts on a stack. In the case of a zero minimum,
3723     the first one was set up above. In all cases the repeat_max now specifies
3724 nigel 93 the number of additional copies needed. Again, we must remember to
3725     replicate entries on the forward reference list. */
3726 nigel 77
3727     if (repeat_max >= 0)
3728     {
3729 nigel 93 /* In the pre-compile phase, we don't actually do the replication. We
3730     just adjust the length as if we had. For each repetition we must add 1
3731     to the length for BRAZERO and for all but the last repetition we must
3732 ph10 202 add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
3733     paranoid checks to avoid integer overflow. */
3734 nigel 93
3735     if (lengthptr != NULL && repeat_max > 0)
3736 ph10 202 {
3737     int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3738     2 - 2*LINK_SIZE; /* Last one doesn't nest */
3739     if ((double)repeat_max *
3740     (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
3741     > (double)INT_MAX ||
3742     OFLOW_MAX - *lengthptr < delta)
3743     {
3744     *errorcodeptr = ERR20;
3745     goto FAILED;
3746     }
3747     *lengthptr += delta;
3748     }
3749 nigel 93
3750     /* This is compiling for real */
3751    
3752     else for (i = repeat_max - 1; i >= 0; i--)
3753 nigel 77 {
3754 nigel 93 uschar *hc;
3755     uschar *this_hwm = cd->hwm;
3756    
3757 nigel 77 *code++ = OP_BRAZERO + repeat_type;
3758    
3759     /* All but the final copy start a new nesting, maintaining the
3760     chain of brackets outstanding. */
3761    
3762     if (i != 0)
3763     {
3764     int offset;
3765     *code++ = OP_BRA;
3766     offset = (bralink == NULL)? 0 : code - bralink;
3767     bralink = code;
3768     PUTINC(code, 0, offset);
3769     }
3770    
3771     memcpy(code, previous, len);
3772 nigel 93 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3773     {
3774     PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3775     cd->hwm += LINK_SIZE;
3776     }
3777     save_hwm = this_hwm;
3778 nigel 77 code += len;
3779     }
3780    
3781     /* Now chain through the pending brackets, and fill in their length
3782     fields (which are holding the chain links pro tem). */
3783    
3784     while (bralink != NULL)
3785     {
3786     int oldlinkoffset;
3787     int offset = code - bralink + 1;
3788     uschar *bra = code - offset;
3789     oldlinkoffset = GET(bra, 1);
3790     bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
3791     *code++ = OP_KET;
3792     PUTINC(code, 0, offset);
3793     PUT(bra, 1, offset);
3794     }
3795     }
3796    
3797     /* If the maximum is unlimited, set a repeater in the final copy. We
3798     can't just offset backwards from the current code point, because we
3799     don't know if there's been an options resetting after the ket. The
3800 nigel 93 correct offset was computed above.
3801 nigel 77
3802 nigel 93 Then, when we are doing the actual compile phase, check to see whether
3803     this group is a non-atomic one that could match an empty string. If so,
3804     convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
3805     that runtime checking can be done. [This check is also applied to
3806     atomic groups at runtime, but in a different way.] */
3807    
3808     else
3809     {
3810     uschar *ketcode = code - ketoffset;
3811     uschar *bracode = ketcode - GET(ketcode, 1);
3812     *ketcode = OP_KETRMAX + repeat_type;
3813     if (lengthptr == NULL && *bracode != OP_ONCE)
3814     {
3815     uschar *scode = bracode;
3816     do
3817     {
3818     if (could_be_empty_branch(scode, ketcode, utf8))
3819     {
3820     *bracode += OP_SBRA - OP_BRA;
3821     break;
3822     }
3823     scode += GET(scode, 1);
3824     }
3825     while (*scode == OP_ALT);
3826     }
3827     }
3828 nigel 77 }
3829    
3830     /* Else there's some kind of shambles */
3831    
3832     else
3833     {
3834     *errorcodeptr = ERR11;
3835     goto FAILED;
3836     }
3837    
3838 nigel 93 /* If the character following a repeat is '+', or if certain optimization
3839     tests above succeeded, possessive_quantifier is TRUE. For some of the
3840     simpler opcodes, there is a special alternative opcode for this. For
3841     anything else, we wrap the entire repeated item inside OP_ONCE brackets.
3842     The '+' notation is just syntactic sugar, taken from Sun's Java package,
3843     but the special opcodes can optimize it a bit. The repeated item starts at
3844     tempcode, not at previous, which might be the first part of a string whose
3845     (former) last char we repeated.
3846 nigel 77
3847 nigel 93 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
3848     an 'upto' may follow. We skip over an 'exact' item, and then test the
3849     length of what remains before proceeding. */
3850    
3851 nigel 77 if (possessive_quantifier)
3852     {
3853 nigel 93 int len;
3854     if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
3855     *tempcode == OP_NOTEXACT)
3856     tempcode += _pcre_OP_lengths[*tempcode];
3857     len = code - tempcode;
3858     if (len > 0) switch (*tempcode)
3859     {
3860     case OP_STAR: *tempcode = OP_POSSTAR; break;
3861     case OP_PLUS: *tempcode = OP_POSPLUS; break;
3862     case OP_QUERY: *tempcode = OP_POSQUERY; break;
3863     case OP_UPTO: *tempcode = OP_POSUPTO; break;
3864    
3865     case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
3866     case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
3867     case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
3868     case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
3869    
3870     case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
3871     case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
3872     case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
3873     case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
3874    
3875     default:
3876     memmove(tempcode + 1+LINK_SIZE, tempcode, len);
3877     code += 1 + LINK_SIZE;
3878     len += 1 + LINK_SIZE;
3879     tempcode[0] = OP_ONCE;
3880     *code++ = OP_KET;
3881     PUTINC(code, 0, len);
3882     PUT(tempcode, 1, len);
3883     break;
3884     }
3885 nigel 77 }
3886    
3887     /* In all cases we no longer have a previous item. We also set the
3888     "follows varying string" flag for subsequently encountered reqbytes if
3889     it isn't already set and we have just passed a varying length item. */
3890    
3891     END_REPEAT:
3892     previous = NULL;
3893     cd->req_varyopt |= reqvary;
3894     break;
3895    
3896    
3897 nigel 93 /* ===================================================================*/
3898     /* Start of nested parenthesized sub-expression, or comment or lookahead or
3899     lookbehind or option setting or condition or all the other extended
3900 ph10 210 parenthesis forms. */
3901 nigel 77
3902     case '(':
3903     newoptions = options;
3904     skipbytes = 0;
3905 nigel 93 bravalue = OP_CBRA;
3906     save_hwm = cd->hwm;
3907 ph10 180 reset_bracount = FALSE;
3908 ph10 211
3909 ph10 210 /* First deal with various "verbs" that can be introduced by '*'. */
3910 ph10 211
3911 ph10 210 if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
3912     {
3913 ph10 211 int i, namelen;
3914 ph10 210 const uschar *name = ++ptr;
3915     previous = NULL;
3916     while ((cd->ctypes[*++ptr] & ctype_letter) != 0);
3917     if (*ptr == ':')
3918     {
3919     *errorcodeptr = ERR59; /* Not supported */
3920 ph10 211 goto FAILED;
3921     }
3922 ph10 210 if (*ptr != ')')
3923     {
3924     *errorcodeptr = ERR60;
3925     goto FAILED;
3926     }
3927 ph10 211 namelen = ptr - name;
3928 ph10 210 for (i = 0; i < verbcount; i++)
3929 ph10 211 {
3930 ph10 210 if (namelen == verbs[i].len &&
3931     strncmp((char *)name, verbs[i].name, namelen) == 0)
3932     {
3933     *code = verbs[i].op;
3934     if (*code++ == OP_ACCEPT) cd->had_accept = TRUE;
3935     break;
3936 ph10 211 }
3937     }
3938     if (i < verbcount) continue;
3939 ph10 210 *errorcodeptr = ERR60;
3940 ph10 211 goto FAILED;
3941     }
3942    
3943 ph10 210 /* Deal with the extended parentheses; all are introduced by '?', and the
3944     appearance of any of them means that this is not a capturing group. */
3945 nigel 77
3946 ph10 210 else if (*ptr == '?')
3947 nigel 77 {
3948 nigel 93 int i, set, unset, namelen;
3949 nigel 77 int *optset;
3950 nigel 93 const uschar *name;
3951     uschar *slot;
3952 nigel 77
3953     switch (*(++ptr))
3954     {
3955     case '#': /* Comment; skip to ket */
3956     ptr++;
3957 nigel 93 while (*ptr != 0 && *ptr != ')') ptr++;
3958     if (*ptr == 0)
3959     {
3960     *errorcodeptr = ERR18;
3961     goto FAILED;
3962     }
3963 nigel 77 continue;
3964    
3965 nigel 93
3966     /* ------------------------------------------------------------ */
3967 ph10 175 case '|': /* Reset capture count for each branch */
3968     reset_bracount = TRUE;
3969 ph10 180 /* Fall through */
3970 ph10 175
3971     /* ------------------------------------------------------------ */
3972 nigel 93 case ':': /* Non-capturing bracket */
3973 nigel 77 bravalue = OP_BRA;
3974     ptr++;
3975     break;
3976    
3977 nigel 93
3978     /* ------------------------------------------------------------ */
3979 nigel 77 case '(':
3980     bravalue = OP_COND; /* Conditional group */
3981    
3982 nigel 93 /* A condition can be an assertion, a number (referring to a numbered
3983     group), a name (referring to a named group), or 'R', referring to
3984     recursion. R<digits> and R&name are also permitted for recursion tests.
3985 nigel 77
3986 nigel 93 There are several syntaxes for testing a named group: (?(name)) is used
3987     by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
3988    
3989     There are two unfortunate ambiguities, caused by history. (a) 'R' can
3990     be the recursive thing or the name 'R' (and similarly for 'R' followed
3991     by digits), and (b) a number could be a name that consists of digits.
3992     In both cases, we look for a name first; if not found, we try the other
3993     cases. */
3994    
3995     /* For conditions that are assertions, check the syntax, and then exit
3996     the switch. This will take control down to where bracketed groups,
3997     including assertions, are processed. */
3998    
3999     if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
4000     break;
4001    
4002     /* Most other conditions use OP_CREF (a couple change to OP_RREF
4003     below), and all need to skip 3 bytes at the start of the group. */
4004    
4005     code[1+LINK_SIZE] = OP_CREF;
4006     skipbytes = 3;
4007 ph10 172 refsign = -1;
4008 nigel 93
4009     /* Check for a test for recursion in a named group. */
4010    
4011     if (ptr[1] == 'R' && ptr[2] == '&')
4012 nigel 77 {
4013 nigel 93 terminator = -1;
4014     ptr += 2;
4015     code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
4016     }
4017 nigel 91
4018 nigel 93 /* Check for a test for a named group's having been set, using the Perl
4019     syntax (?(<name>) or (?('name') */
4020 nigel 91
4021 nigel 93 else if (ptr[1] == '<')
4022     {
4023     terminator = '>';
4024     ptr++;
4025     }
4026     else if (ptr[1] == '\'')
4027     {
4028     terminator = '\'';
4029     ptr++;
4030     }
4031 ph10 172 else
4032 ph10 167 {
4033     terminator = 0;
4034 ph10 172 if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
4035     }
4036 nigel 77
4037 nigel 93 /* We now expect to read a name; anything else is an error */
4038 nigel 77
4039 nigel 93 if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
4040     {
4041     ptr += 1; /* To get the right offset */
4042     *errorcodeptr = ERR28;
4043     goto FAILED;
4044     }
4045    
4046     /* Read the name, but also get it as a number if it's all digits */
4047    
4048     recno = 0;
4049     name = ++ptr;
4050     while ((cd->ctypes[*ptr] & ctype_word) != 0)
4051     {
4052     if (recno >= 0)
4053     recno = ((digitab[*ptr] & ctype_digit) != 0)?
4054     recno * 10 + *ptr - '0' : -1;
4055 nigel 91 ptr++;
4056 nigel 93 }
4057     namelen = ptr - name;
4058 nigel 91
4059 nigel 93 if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
4060     {
4061     ptr--; /* Error offset */
4062     *errorcodeptr = ERR26;
4063     goto FAILED;
4064     }
4065 nigel 91
4066 nigel 93 /* Do no further checking in the pre-compile phase. */
4067 nigel 91
4068 nigel 93 if (lengthptr != NULL) break;
4069 nigel 91
4070 nigel 93 /* In the real compile we do the work of looking for the actual
4071 ph10 167 reference. If the string started with "+" or "-" we require the rest to
4072     be digits, in which case recno will be set. */
4073 ph10 172
4074 ph10 167 if (refsign > 0)
4075     {
4076     if (recno <= 0)
4077     {
4078     *errorcodeptr = ERR58;
4079     goto FAILED;
4080 ph10 172 }
4081 ph10 167 if (refsign == '-')
4082     {
4083 ph10 172 recno = cd->bracount - recno + 1;
4084 ph10 167 if (recno <= 0)
4085     {
4086     *errorcodeptr = ERR15;
4087     goto FAILED;
4088 ph10 172 }
4089 ph10 167 }
4090 ph10 172 else recno += cd->bracount;
4091 ph10 167 PUT2(code, 2+LINK_SIZE, recno);
4092     break;
4093 ph10 172 }
4094 nigel 91
4095 ph10 167 /* Otherwise (did not start with "+" or "-"), start by looking for the
4096     name. */
4097 ph10 172
4098 nigel 93 slot = cd->name_table;
4099     for (i = 0; i < cd->names_found; i++)
4100     {
4101     if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4102     slot += cd->name_entry_size;
4103     }
4104 nigel 91
4105 nigel 93 /* Found a previous named subpattern */
4106 nigel 91
4107 nigel 93 if (i < cd->names_found)
4108     {
4109     recno = GET2(slot, 0);
4110     PUT2(code, 2+LINK_SIZE, recno);
4111     }
4112 nigel 91
4113 nigel 93 /* Search the pattern for a forward reference */