/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 218 - (hide annotations) (download)
Thu Aug 16 10:13:23 2007 UTC (5 years, 10 months ago) by ph10
File MIME type: text/plain
File size: 189364 byte(s)
Fix several compile bugs involving repeated \p or \P items.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 117 Copyright (c) 1997-2007 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains the external function pcre_compile(), along with
42     supporting internal functions that are not used by other modules. */
43    
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 199 #include <config.h>
47 ph10 200 #endif
48 ph10 199
49 nigel 93 #define NLBLOCK cd /* Block containing newline information */
50     #define PSSTART start_pattern /* Field containing processed string start */
51     #define PSEND end_pattern /* Field containing processed string end */
52    
53 nigel 77 #include "pcre_internal.h"
54    
55    
56 nigel 85 /* When DEBUG is defined, we need the pcre_printint() function, which is also
57     used by pcretest. DEBUG is not defined when building a production library. */
58    
59     #ifdef DEBUG
60     #include "pcre_printint.src"
61     #endif
62    
63    
/* Macro for setting individual bits in class bitmaps: sets bit number b in the
byte array a. Both arguments and the whole expansion are parenthesized so that
expressions such as "map + 32" or "c + 1" can be passed safely. Note that b is
evaluated twice, so it must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
67    
68 ph10 202 /* Maximum length value to check against when making sure that the integer that
69     holds the compiled pattern length does not overflow. We make it a bit less than
70     INT_MAX to allow for adding in group terminating bytes, so that we don't have
71     to check them every time. */
72 ph10 178
73 ph10 202 #define OFLOW_MAX (INT_MAX - 20)
74    
75    
76 nigel 77 /*************************************************
77     * Code parameters and static tables *
78     *************************************************/
79    
80 nigel 93 /* This value specifies the size of stack workspace that is used during the
81     first pre-compile phase that determines how much memory is required. The regex
82     is partly compiled into this space, but the compiled parts are discarded as
83     soon as they can be, so that hopefully there will never be an overrun. The code
84     does, however, check for an overrun. The largest amount I've seen used is 218,
85     so this number is very generous.
86 nigel 77
87 nigel 93 The same workspace is used during the second, actual compile phase for
88     remembering forward references to groups so that they can be filled in at the
89     end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
90     is 4 there is plenty of room. */
91 nigel 77
92 nigel 93 #define COMPILE_WORK_SIZE (4096)
93 nigel 77
94 nigel 93
95 nigel 77 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
96     are simple data values; negative values are for special things like \d and so
97     on. Zero means further processing is needed (for things like \x), or the escape
98     is invalid. */
99    
100 ph10 97 #ifndef EBCDIC /* This is the "normal" table for ASCII systems */
101 nigel 77 static const short int escapes[] = {
102     0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
103     0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
104     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
105 ph10 178 -ESC_H, 0, 0, -ESC_K, 0, 0, 0, 0, /* H - O */
106     -ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, -ESC_V, -ESC_W, /* P - W */
107 nigel 77 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
108     '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
109 ph10 178 -ESC_h, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */
110     -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, -ESC_v, -ESC_w, /* p - w */
111 nigel 77 0, 0, -ESC_z /* x - z */
112     };
113    
114 ph10 97 #else /* This is the "abnormal" table for EBCDIC systems */
115 nigel 77 static const short int escapes[] = {
116     /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
117     /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
118     /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
119     /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
120     /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
121     /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
122     /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
123     /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
124 ph10 178 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
125 nigel 93 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
126 nigel 77 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
127 ph10 178 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
128 nigel 77 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
129     /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
130     /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
131     /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
132 ph10 178 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
133 ph10 195 /* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P,
134 nigel 93 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
135 ph10 178 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
136 nigel 77 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
137     /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
138     /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
139     };
140     #endif
141    
142    
143 ph10 210 /* Table of special "verbs" like (*PRUNE) */
144    
145     typedef struct verbitem {
146     const char *name;
147     int len;
148     int op;
149 ph10 211 } verbitem;
150 ph10 210
151     static verbitem verbs[] = {
152     { "ACCEPT", 6, OP_ACCEPT },
153     { "COMMIT", 6, OP_COMMIT },
154     { "F", 1, OP_FAIL },
155 ph10 211 { "FAIL", 4, OP_FAIL },
156 ph10 210 { "PRUNE", 5, OP_PRUNE },
157     { "SKIP", 4, OP_SKIP },
158     { "THEN", 4, OP_THEN }
159     };
160    
161     static int verbcount = sizeof(verbs)/sizeof(verbitem);
162    
163    
164 nigel 77 /* Tables of names of POSIX character classes and their lengths. The list is
165 nigel 87 terminated by a zero length entry. The first three must be alpha, lower, upper,
166 nigel 77 as this is assumed for handling case independence. */
167    
168     static const char *const posix_names[] = {
169     "alpha", "lower", "upper",
170     "alnum", "ascii", "blank", "cntrl", "digit", "graph",
171     "print", "punct", "space", "word", "xdigit" };
172    
173     static const uschar posix_name_lengths[] = {
174     5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
175    
176 nigel 87 /* Table of class bit maps for each POSIX class. Each class is formed from a
177     base map, with an optional addition or removal of another map. Then, for some
178     classes, there is some additional tweaking: for [:blank:] the vertical space
179     characters are removed, and for [:alpha:] and [:alnum:] the underscore
180     character is removed. The triples in the table consist of the base map offset,
181     second map offset or -1 if no second map, and a non-negative value for map
182     addition or a negative value for map subtraction (if there are two maps). The
183     absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
184     remove vertical space characters, 2 => remove underscore. */
185 nigel 77
186     static const int posix_class_maps[] = {
187 nigel 87 cbit_word, cbit_digit, -2, /* alpha */
188     cbit_lower, -1, 0, /* lower */
189     cbit_upper, -1, 0, /* upper */
190     cbit_word, -1, 2, /* alnum - word without underscore */
191     cbit_print, cbit_cntrl, 0, /* ascii */
192     cbit_space, -1, 1, /* blank - a GNU extension */
193     cbit_cntrl, -1, 0, /* cntrl */
194     cbit_digit, -1, 0, /* digit */
195     cbit_graph, -1, 0, /* graph */
196     cbit_print, -1, 0, /* print */
197     cbit_punct, -1, 0, /* punct */
198     cbit_space, -1, 0, /* space */
199     cbit_word, -1, 0, /* word - a Perl extension */
200     cbit_xdigit,-1, 0 /* xdigit */
201 nigel 77 };
202    
203    
/* Two-level stringification, so that a macro argument is expanded before it
is turned into a string. */

#define STRING(a)  # a
#define XSTRING(s) STRING(s)

/* The texts of compile-time error messages, indexed by error number. The
elements are "const char *" because they are passed to the outside world; the
array itself is never modified, so the pointers are const as well. Do not ever
re-use any error number, because they are documented. Always add a new error
instead. Messages marked DEAD below are no longer used. */

static const char * const error_texts[] = {
  /* 0 */
  "no error",
  "\\ at end of pattern",
  "\\c at end of pattern",
  "unrecognized character follows \\",
  "numbers out of order in {} quantifier",
  /* 5 */
  "number too big in {} quantifier",
  "missing terminating ] for character class",
  "invalid escape sequence in character class",
  "range out of order in character class",
  "nothing to repeat",
  /* 10 */
  "operand of unlimited repeat could match the empty string",  /** DEAD **/
  "internal error: unexpected repeat",
  "unrecognized character after (?",
  "POSIX named classes are supported only within a class",
  "missing )",
  /* 15 */
  "reference to non-existent subpattern",
  "erroffset passed as NULL",
  "unknown option bit(s) set",
  "missing ) after comment",
  "parentheses nested too deeply",  /** DEAD **/
  /* 20 */
  "regular expression is too large",
  "failed to get memory",
  "unmatched parentheses",
  "internal error: code overflow",
  "unrecognized character after (?<",
  /* 25 */
  "lookbehind assertion is not fixed length",
  "malformed number or name after (?(",
  "conditional group contains more than two branches",
  "assertion expected after (?(",
  "(?R or (?[+-]digits must be followed by )",
  /* 30 */
  "unknown POSIX class name",
  "POSIX collating elements are not supported",
  "this version of PCRE is not compiled with PCRE_UTF8 support",
  "spare error",  /** DEAD **/
  "character value in \\x{...} sequence is too large",
  /* 35 */
  "invalid condition (?(0)",
  "\\C not allowed in lookbehind assertion",
  "PCRE does not support \\L, \\l, \\N, \\U, or \\u",
  "number after (?C is > 255",
  "closing ) for (?C expected",
  /* 40 */
  "recursive call could loop indefinitely",
  "unrecognized character after (?P",
  "syntax error in subpattern name (missing terminator)",
  "two named subpatterns have the same name",
  "invalid UTF-8 string",
  /* 45 */
  "support for \\P, \\p, and \\X has not been compiled",
  "malformed \\P or \\p sequence",
  "unknown property name after \\P or \\p",
  "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)",
  "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")",
  /* 50 */
  "repeated subpattern is too long",  /** DEAD **/
  "octal value is greater than \\377 (not in UTF-8 mode)",
  "internal error: overran compiling workspace",
  "internal error: previously-checked referenced subpattern not found",
  "DEFINE group contains more than one branch",
  /* 55 */
  "repeating a DEFINE group is not allowed",
  "inconsistent NEWLINE options",
  "\\g is not followed by a braced name or an optionally braced non-zero number",
  "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number",
  "(*VERB) with an argument is not supported",
  /* 60 */
  "(*VERB) not recognized",
  "number is too big"
};
288    
289    
290     /* Table to identify digits and hex digits. This is used when compiling
291     patterns. Note that the tables in chartables are dependent on the locale, and
292     may mark arbitrary characters as digits - but the PCRE compiling code expects
293     to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
294     a private table here. It costs 256 bytes, but it is a lot faster than doing
295     character value tests (at least in some simple cases I timed), and in some
296     applications one wants PCRE to compile efficiently as well as match
297     efficiently.
298    
299     For convenience, we use the same bit definitions as in chartables:
300    
301     0x04 decimal digit
302     0x08 hexadecimal digit
303    
304     Then we can use ctype_digit and ctype_xdigit in the code. */
305    
306 ph10 97 #ifndef EBCDIC /* This is the "normal" case, for ASCII systems */
/* ASCII version of the private digit table: 0x04 marks a decimal digit and
0x08 a hexadecimal digit, so '0'-'9' hold 0x0c and 'A'-'F' / 'a'-'f' hold 0x08.
The dimension is given explicitly so that the table always has an entry for
every possible byte value, whatever happens to the initializer list. */

static const unsigned char digitab[256] =
  {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  32- 39 (space - ') */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47 (( - /) */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  48- 55 (0 - 7) */
  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63 (8 - ?) */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  64- 71 (@ - G) */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- 79 (H - O) */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  80- 87 (P - W) */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- 95 (X - _) */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  96-103 (` - g) */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104-111 (h - o) */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 (p - w) */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120-127 (x - 127) */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
341    
342 ph10 97 #else /* This is the "abnormal" case, for EBCDIC systems */
/* EBCDIC version of the private digit table: 0x04 marks a decimal digit and
0x08 a hexadecimal digit. The letters a-f are at 0x81-0x86, A-F at 0xC1-0xC6,
and the digits at 0xF0-0xF9. The comment on each row gives the code point of
its first entry. The dimension is given explicitly so that the table always has
an entry for every possible byte value. */

static const unsigned char digitab[256] =
  {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 00 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 08 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 10 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 18 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 20 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 28 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 30 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 38 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 50 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 58 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 60 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 68 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 70 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 78 */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 80 (a - g) */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 90 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 98 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* A0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* A8 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* B0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* B8 */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* C0 (A - G) */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* C8 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* D0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* D8 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* E0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* E8 */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* F0 (0 - 7) */
  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* F8 (8 - 9) */
377    
/* Partial duplicate of the character-type table, for EBCDIC systems. The
comment on each row gives the code point of its first entry. The dimension is
given explicitly so that the table always has an entry for every possible byte
value. */

static const unsigned char ebcdic_chartab[256] = {
  0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 00 */
  0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 08 */
  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 10 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 18 */
  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 20 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 28 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 30 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 38 */
  0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40 */
  0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 48 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 50 */
  0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 58 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 60 */
  0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 68 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 70 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 78 */
  0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 80 (a - g) */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* 88 (h - i) */
  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 90 (j - p) */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* 98 (q - r) */
  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* A0 (s - x) */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* A8 (y - z) */
  0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* B0 */
  0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* B8 */
  0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* C0 ({, A - G) */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* C8 (H - I) */
  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* D0 (}, J - P) */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* D8 (Q - R) */
  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* E0 (\, S - X) */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* E8 (Y - Z) */
  0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* F0 (0 - 7) */
  0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* F8 (8 - 9) */
411     #endif
412    
413    
414     /* Definition to allow mutual recursion */
415    
416     static BOOL
417 ph10 180 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
418 ph10 175 int *, int *, branch_chain *, compile_data *, int *);
419 nigel 77
420    
421    
422     /*************************************************
423     * Handle escapes *
424     *************************************************/
425    
426     /* This function is called when a \ has been encountered. It either returns a
427     positive value for a simple escape such as \n, or a negative value which
428 nigel 93 encodes one of the more complicated things such as \d. A backreference to group
429     n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
430     UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
431     ptr is pointing at the \. On exit, it is on the final character of the escape
432     sequence.
433 nigel 77
434     Arguments:
435     ptrptr points to the pattern position pointer
436     errorcodeptr points to the errorcode variable
437     bracount number of previous extracting brackets
438     options the options bits
439     isclass TRUE if inside a character class
440    
441     Returns: zero or positive => a data character
442     negative => a special escape sequence
443 ph10 213 on error, errorcodeptr is set
444 nigel 77 */
445    
446     static int
447     check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
448     int options, BOOL isclass)
449     {
450 nigel 87 BOOL utf8 = (options & PCRE_UTF8) != 0;
451     const uschar *ptr = *ptrptr + 1;
452 nigel 77 int c, i;
453    
454 nigel 87 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
455     ptr--; /* Set pointer back to the last byte */
456    
457 nigel 77 /* If backslash is at the end of the pattern, it's an error. */
458    
459     if (c == 0) *errorcodeptr = ERR1;
460    
461     /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
462     a table. A non-zero result is something that can be returned immediately.
463     Otherwise further processing may be required. */
464    
465 ph10 97 #ifndef EBCDIC /* ASCII coding */
466 nigel 77 else if (c < '0' || c > 'z') {} /* Not alphameric */
467     else if ((i = escapes[c - '0']) != 0) c = i;
468    
469 ph10 97 #else /* EBCDIC coding */
470 nigel 77 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
471     else if ((i = escapes[c - 0x48]) != 0) c = i;
472     #endif
473    
474     /* Escapes that need further processing, or are illegal. */
475    
476     else
477     {
478     const uschar *oldptr;
479 nigel 93 BOOL braced, negated;
480    
481 nigel 77 switch (c)
482     {
483     /* A number of Perl escapes are not handled by PCRE. We give an explicit
484     error. */
485    
486     case 'l':
487     case 'L':
488     case 'N':
489     case 'u':
490     case 'U':
491     *errorcodeptr = ERR37;
492     break;
493    
494 nigel 93 /* \g must be followed by a number, either plain or braced. If positive, it
495     is an absolute backreference. If negative, it is a relative backreference.
496 ph10 172 This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a
497     reference to a named group. This is part of Perl's movement towards a
498     unified syntax for back references. As this is synonymous with \k{name}, we
499 ph10 171 fudge it up by pretending it really was \k. */
500 nigel 93
501     case 'g':
502     if (ptr[1] == '{')
503     {
504 ph10 171 const uschar *p;
505     for (p = ptr+2; *p != 0 && *p != '}'; p++)
506     if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
507 ph10 172 if (*p != 0 && *p != '}')
508 ph10 171 {
509     c = -ESC_k;
510     break;
511 ph10 172 }
512 nigel 93 braced = TRUE;
513     ptr++;
514     }
515     else braced = FALSE;
516    
517     if (ptr[1] == '-')
518     {
519     negated = TRUE;
520     ptr++;
521     }
522     else negated = FALSE;
523    
524     c = 0;
525     while ((digitab[ptr[1]] & ctype_digit) != 0)
526     c = c * 10 + *(++ptr) - '0';
527 ph10 213
528     if (c < 0)
529     {
530     *errorcodeptr = ERR61;
531     break;
532     }
533 nigel 93
534     if (c == 0 || (braced && *(++ptr) != '}'))
535     {
536     *errorcodeptr = ERR57;
537 ph10 213 break;
538 nigel 93 }
539    
540     if (negated)
541     {
542     if (c > bracount)
543     {
544     *errorcodeptr = ERR15;
545 ph10 213 break;
546 nigel 93 }
547     c = bracount - (c - 1);
548     }
549    
550     c = -(ESC_REF + c);
551     break;
552    
553 nigel 77 /* The handling of escape sequences consisting of a string of digits
554     starting with one that is not zero is not straightforward. By experiment,
555     the way Perl works seems to be as follows:
556    
557     Outside a character class, the digits are read as a decimal number. If the
558     number is less than 10, or if there are that many previous extracting
559     left brackets, then it is a back reference. Otherwise, up to three octal
560     digits are read to form an escaped byte. Thus \123 is likely to be octal
561     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
562     value is greater than 377, the least significant 8 bits are taken. Inside a
563     character class, \ followed by a digit is always an octal number. */
564    
565     case '1': case '2': case '3': case '4': case '5':
566     case '6': case '7': case '8': case '9':
567    
568     if (!isclass)
569     {
570     oldptr = ptr;
571     c -= '0';
572     while ((digitab[ptr[1]] & ctype_digit) != 0)
573     c = c * 10 + *(++ptr) - '0';
574 ph10 213 if (c < 0)
575     {
576     *errorcodeptr = ERR61;
577     break;
578     }
579 nigel 77 if (c < 10 || c <= bracount)
580     {
581     c = -(ESC_REF + c);
582     break;
583     }
584     ptr = oldptr; /* Put the pointer back and fall through */
585     }
586    
587     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
588     generates a binary zero byte and treats the digit as a following literal.
589     Thus we have to pull back the pointer by one. */
590    
591     if ((c = *ptr) >= '8')
592     {
593     ptr--;
594     c = 0;
595     break;
596     }
597    
598     /* \0 always starts an octal number, but we may drop through to here with a
599 nigel 91 larger first octal digit. The original code used just to take the least
600     significant 8 bits of octal numbers (I think this is what early Perls used
601     to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
602     than 3 octal digits. */
603 nigel 77
604     case '0':
605     c -= '0';
606     while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
607     c = c * 8 + *(++ptr) - '0';
608 nigel 91 if (!utf8 && c > 255) *errorcodeptr = ERR51;
609 nigel 77 break;
610    
611 nigel 87 /* \x is complicated. \x{ddd} is a character number which can be greater
612     than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
613     treated as a data character. */
614 nigel 77
615     case 'x':
616 nigel 87 if (ptr[1] == '{')
617 nigel 77 {
618     const uschar *pt = ptr + 2;
619 nigel 87 int count = 0;
620    
621 nigel 77 c = 0;
622     while ((digitab[*pt] & ctype_xdigit) != 0)
623     {
624 nigel 87 register int cc = *pt++;
625     if (c == 0 && cc == '0') continue; /* Leading zeroes */
626 nigel 77 count++;
627 nigel 87
628 ph10 97 #ifndef EBCDIC /* ASCII coding */
629 nigel 77 if (cc >= 'a') cc -= 32; /* Convert to upper case */
630 nigel 87 c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
631 ph10 97 #else /* EBCDIC coding */
632 nigel 77 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
633 nigel 87 c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
634 nigel 77 #endif
635     }
636 nigel 87
637 nigel 77 if (*pt == '}')
638     {
639 nigel 87 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
640 nigel 77 ptr = pt;
641     break;
642     }
643 nigel 87
644 nigel 77 /* If the sequence of hex digits does not end with '}', then we don't
645     recognize this construct; fall through to the normal \x handling. */
646     }
647    
648 nigel 87 /* Read just a single-byte hex-defined char */
649 nigel 77
650     c = 0;
651     while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
652     {
653     int cc; /* Some compilers don't like ++ */
654     cc = *(++ptr); /* in initializers */
655 ph10 97 #ifndef EBCDIC /* ASCII coding */
656 nigel 77 if (cc >= 'a') cc -= 32; /* Convert to upper case */
657     c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
658 ph10 97 #else /* EBCDIC coding */
659 nigel 77 if (cc <= 'z') cc += 64; /* Convert to upper case */
660     c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
661     #endif
662     }
663     break;
664    
665 nigel 93 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
666     This coding is ASCII-specific, but then the whole concept of \cx is
667     ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
668 nigel 77
669     case 'c':
670     c = *(++ptr);
671     if (c == 0)
672     {
673     *errorcodeptr = ERR2;
674 ph10 213 break;
675 nigel 77 }
676    
677 ph10 97 #ifndef EBCDIC /* ASCII coding */
678 nigel 77 if (c >= 'a' && c <= 'z') c -= 32;
679     c ^= 0x40;
680 ph10 97 #else /* EBCDIC coding */
681 nigel 77 if (c >= 'a' && c <= 'z') c += 64;
682     c ^= 0xC0;
683     #endif
684     break;
685    
686     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
687     other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
688     for Perl compatibility, it is a literal. This code looks a bit odd, but
689     there used to be some cases other than the default, and there may be again
690     in future, so I haven't "optimized" it. */
691    
692     default:
693     if ((options & PCRE_EXTRA) != 0) switch(c)
694     {
695     default:
696     *errorcodeptr = ERR3;
697     break;
698     }
699     break;
700     }
701     }
702    
703     *ptrptr = ptr;
704     return c;
705     }
706    
707    
708    
#ifdef SUPPORT_UCP
/*************************************************
*               Handle \P and \p                 *
*************************************************/

/* This function is called after \P or \p has been encountered, provided that
PCRE is compiled with support for Unicode properties. On entry, ptrptr is
pointing at the P or p. On exit, it is pointing at the final character of the
escape sequence, or, after a malformed sequence, at the character where the
scan stopped.

Argument:
  ptrptr         points to the pattern position pointer
  negptr         points to a boolean that is set TRUE for negation else FALSE;
                   it is always set, even when an error is returned
  dptr           points to an int that is set to the detailed property value;
                   it is set only when a property name is recognized
  errorcodeptr   points to the error code variable; set to ERR46 for a
                   malformed sequence, or ERR47 for an unknown property name

Returns:         the type field of the matching _pcre_utt entry, or -1 if
                   the sequence is malformed or the name is not recognized
*/

static int
get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
{
int c, i, bot, top;
const uschar *ptr = *ptrptr;
char name[32];

/* Give the negation flag a defined value before anything can fail. */

*negptr = FALSE;

c = *(++ptr);
if (c == 0) goto ERROR_RETURN;

/* \P or \p can be followed by a name in {}, optionally preceded by ^ for
negation. A name that is too long for the buffer, or that is not terminated by
a closing brace, is malformed. */

if (c == '{')
  {
  if (ptr[1] == '^')
    {
    *negptr = TRUE;
    ptr++;
    }
  for (i = 0; i < (int)sizeof(name) - 1; i++)
    {
    c = *(++ptr);
    if (c == 0) goto ERROR_RETURN;
    if (c == '}') break;
    name[i] = c;
    }
  if (c != '}') goto ERROR_RETURN;
  name[i] = 0;
  }

/* Otherwise there is just one following character */

else
  {
  name[0] = c;
  name[1] = 0;
  }

/* The sequence is well-formed; leave the caller's pointer on its last
character, whether or not the name turns out to be known. */

*ptrptr = ptr;

/* Search for a recognized property name using binary chop */

bot = 0;
top = _pcre_utt_size;

while (bot < top)
  {
  i = (bot + top) >> 1;
  c = strcmp(name, _pcre_utt[i].name);
  if (c == 0)
    {
    *dptr = _pcre_utt[i].value;
    return _pcre_utt[i].type;
    }
  if (c > 0) bot = i + 1; else top = i;
  }

*errorcodeptr = ERR47;
return -1;

ERROR_RETURN:
*errorcodeptr = ERR46;
*ptrptr = ptr;
return -1;
}
#endif
798    
799    
800    
801    
802     /*************************************************
803     * Check for counted repeat *
804     *************************************************/
805    
806     /* This function is called when a '{' is encountered in a place where it might
807     start a quantifier. It looks ahead to see if it really is a quantifier or not.
808     It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
809     where the ddds are digits.
810    
811     Arguments:
812     p pointer to the first char after '{'
813    
814     Returns: TRUE or FALSE
815     */
816    
817     static BOOL
818     is_counted_repeat(const uschar *p)
819     {
820     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
821     while ((digitab[*p] & ctype_digit) != 0) p++;
822     if (*p == '}') return TRUE;
823    
824     if (*p++ != ',') return FALSE;
825     if (*p == '}') return TRUE;
826    
827     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
828     while ((digitab[*p] & ctype_digit) != 0) p++;
829    
830     return (*p == '}');
831     }
832    
833    
834    
835     /*************************************************
836     * Read repeat counts *
837     *************************************************/
838    
839     /* Read an item of the form {n,m} and return the values. This is called only
840     after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
841     so the syntax is guaranteed to be correct, but we need to check the values.
842    
843     Arguments:
844     p pointer to first char after '{'
845     minp pointer to int for min
846     maxp pointer to int for max
847     returned as -1 if no max
848     errorcodeptr points to error code variable
849    
850     Returns: pointer to '}' on success;
851     current ptr on error, with errorcodeptr set non-zero
852     */
853    
854     static const uschar *
855     read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
856     {
857     int min = 0;
858     int max = -1;
859    
860 nigel 81 /* Read the minimum value and do a paranoid check: a negative value indicates
861     an integer overflow. */
862    
863 nigel 77 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
864 nigel 81 if (min < 0 || min > 65535)
865     {
866     *errorcodeptr = ERR5;
867     return p;
868     }
869 nigel 77
870 nigel 81 /* Read the maximum value if there is one, and again do a paranoid on its size.
871     Also, max must not be less than min. */
872    
873 nigel 77 if (*p == '}') max = min; else
874     {
875     if (*(++p) != '}')
876     {
877     max = 0;
878     while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
879 nigel 81 if (max < 0 || max > 65535)
880     {
881     *errorcodeptr = ERR5;
882     return p;
883     }
884 nigel 77 if (max < min)
885     {
886     *errorcodeptr = ERR4;
887     return p;
888     }
889     }
890     }
891    
892 nigel 81 /* Fill in the required variables, and pass back the pointer to the terminating
893     '}'. */
894 nigel 77
895 nigel 81 *minp = min;
896     *maxp = max;
897 nigel 77 return p;
898     }
899    
900    
901    
902     /*************************************************
903 nigel 93 * Find forward referenced subpattern *
904 nigel 91 *************************************************/
905    
906 nigel 93 /* This function scans along a pattern's text looking for capturing
907     subpatterns, and counting them. If it finds a named pattern that matches the
908     name it is given, it returns its number. Alternatively, if the name is NULL, it
909     returns when it reaches a given numbered subpattern. This is used for forward
910     references to subpatterns. We know that if (?P< is encountered, the name will
911     be terminated by '>' because that is checked in the first pass.
912 nigel 91
913     Arguments:
914 nigel 93 ptr current position in the pattern
915     count current count of capturing parens so far encountered
916     name name to seek, or NULL if seeking a numbered subpattern
917     lorn name length, or subpattern number if name is NULL
918     xmode TRUE if we are in /x mode
919 nigel 91
920     Returns: the number of the named subpattern, or -1 if not found
921     */
922    
923     static int
924 nigel 93 find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
925     BOOL xmode)
926 nigel 91 {
927     const uschar *thisname;
928 nigel 93
929 nigel 91 for (; *ptr != 0; ptr++)
930     {
931 nigel 93 int term;
932    
933     /* Skip over backslashed characters and also entire \Q...\E */
934    
935     if (*ptr == '\\')
936     {
937     if (*(++ptr) == 0) return -1;
938     if (*ptr == 'Q') for (;;)
939     {
940     while (*(++ptr) != 0 && *ptr != '\\');
941     if (*ptr == 0) return -1;
942     if (*(++ptr) == 'E') break;
943     }
944     continue;
945     }
946    
947     /* Skip over character classes */
948    
949     if (*ptr == '[')
950     {
951     while (*(++ptr) != ']')
952     {
953 ph10 215 if (*ptr == 0) return -1;
954 nigel 93 if (*ptr == '\\')
955     {
956     if (*(++ptr) == 0) return -1;
957     if (*ptr == 'Q') for (;;)
958     {
959     while (*(++ptr) != 0 && *ptr != '\\');
960     if (*ptr == 0) return -1;
961     if (*(++ptr) == 'E') break;
962     }
963     continue;
964     }
965     }
966     continue;
967     }
968    
969     /* Skip comments in /x mode */
970    
971     if (xmode && *ptr == '#')
972     {
973     while (*(++ptr) != 0 && *ptr != '\n');
974     if (*ptr == 0) return -1;
975     continue;
976     }
977    
978     /* An opening parens must now be a real metacharacter */
979    
980 nigel 91 if (*ptr != '(') continue;
981 ph10 210 if (ptr[1] != '?' && ptr[1] != '*')
982 nigel 93 {
983     count++;
984     if (name == NULL && count == lorn) return count;
985     continue;
986     }
987    
988     ptr += 2;
989     if (*ptr == 'P') ptr++; /* Allow optional P */
990    
991     /* We have to disambiguate (?<! and (?<= from (?<name> */
992    
993     if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
994     *ptr != '\'')
995     continue;
996    
997 nigel 91 count++;
998 nigel 93
999     if (name == NULL && count == lorn) return count;
1000     term = *ptr++;
1001     if (term == '<') term = '>';
1002 nigel 91 thisname = ptr;
1003 nigel 93 while (*ptr != term) ptr++;
1004     if (name != NULL && lorn == ptr - thisname &&
1005     strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1006 nigel 91 return count;
1007     }
1008 nigel 93
1009 nigel 91 return -1;
1010     }
1011    
1012    
1013    
1014     /*************************************************
1015 nigel 77 * Find first significant op code *
1016     *************************************************/
1017    
1018     /* This is called by several functions that scan a compiled expression looking
1019     for a fixed first character, or an anchoring op code etc. It skips over things
1020     that do not influence this. For some calls, a change of option is important.
1021     For some calls, it makes sense to skip negative forward and all backward
1022     assertions, and also the \b assertion; for others it does not.
1023    
1024     Arguments:
1025     code pointer to the start of the group
1026     options pointer to external options
1027     optbit the option bit whose changing is significant, or
1028     zero if none are
1029     skipassert TRUE if certain assertions are to be skipped
1030    
1031     Returns: pointer to the first significant opcode
1032     */
1033    
1034     static const uschar*
1035     first_significant_code(const uschar *code, int *options, int optbit,
1036     BOOL skipassert)
1037     {
1038     for (;;)
1039     {
1040     switch ((int)*code)
1041     {
1042     case OP_OPT:
1043     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1044     *options = (int)code[1];
1045     code += 2;
1046     break;
1047    
1048     case OP_ASSERT_NOT:
1049     case OP_ASSERTBACK:
1050     case OP_ASSERTBACK_NOT:
1051     if (!skipassert) return code;
1052     do code += GET(code, 1); while (*code == OP_ALT);
1053     code += _pcre_OP_lengths[*code];
1054     break;
1055    
1056     case OP_WORD_BOUNDARY:
1057     case OP_NOT_WORD_BOUNDARY:
1058     if (!skipassert) return code;
1059     /* Fall through */
1060    
1061     case OP_CALLOUT:
1062     case OP_CREF:
1063 nigel 93 case OP_RREF:
1064     case OP_DEF:
1065 nigel 77 code += _pcre_OP_lengths[*code];
1066     break;
1067    
1068     default:
1069     return code;
1070     }
1071     }
1072     /* Control never reaches here */
1073     }
1074    
1075    
1076    
1077    
1078     /*************************************************
1079     * Find the fixed length of a pattern *
1080     *************************************************/
1081    
1082     /* Scan a pattern and compute the fixed length of subject that will match it,
1083     if the length is fixed. This is needed for dealing with backward assertions.
1084     In UTF8 mode, the result is in characters rather than bytes.
1085    
1086     Arguments:
1087     code points to the start of the pattern (the bracket)
1088     options the compiling options
1089    
1090     Returns: the fixed length, or -1 if there is no fixed length,
1091     or -2 if \C was encountered
1092     */
1093    
1094     static int
1095     find_fixedlength(uschar *code, int options)
1096     {
1097     int length = -1;
1098    
1099     register int branchlength = 0;
1100     register uschar *cc = code + 1 + LINK_SIZE;
1101    
1102     /* Scan along the opcodes for this branch. If we get to the end of the
1103     branch, check the length against that of the other branches. */
1104    
1105     for (;;)
1106     {
1107     int d;
1108     register int op = *cc;
1109     switch (op)
1110     {
1111 nigel 93 case OP_CBRA:
1112 nigel 77 case OP_BRA:
1113     case OP_ONCE:
1114     case OP_COND:
1115 nigel 93 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1116 nigel 77 if (d < 0) return d;
1117     branchlength += d;
1118     do cc += GET(cc, 1); while (*cc == OP_ALT);
1119     cc += 1 + LINK_SIZE;
1120     break;
1121    
1122     /* Reached end of a branch; if it's a ket it is the end of a nested
1123     call. If it's ALT it is an alternation in a nested call. If it is
1124     END it's the end of the outer call. All can be handled by the same code. */
1125    
1126     case OP_ALT:
1127     case OP_KET:
1128     case OP_KETRMAX:
1129     case OP_KETRMIN:
1130     case OP_END:
1131     if (length < 0) length = branchlength;
1132     else if (length != branchlength) return -1;
1133     if (*cc != OP_ALT) return length;
1134     cc += 1 + LINK_SIZE;
1135     branchlength = 0;
1136     break;
1137    
1138     /* Skip over assertive subpatterns */
1139    
1140     case OP_ASSERT:
1141     case OP_ASSERT_NOT:
1142     case OP_ASSERTBACK:
1143     case OP_ASSERTBACK_NOT:
1144     do cc += GET(cc, 1); while (*cc == OP_ALT);
1145     /* Fall through */
1146    
1147     /* Skip over things that don't match chars */
1148    
1149     case OP_REVERSE:
1150     case OP_CREF:
1151 nigel 93 case OP_RREF:
1152     case OP_DEF:
1153 nigel 77 case OP_OPT:
1154     case OP_CALLOUT:
1155     case OP_SOD:
1156     case OP_SOM:
1157     case OP_EOD:
1158     case OP_EODN:
1159     case OP_CIRC:
1160     case OP_DOLL:
1161     case OP_NOT_WORD_BOUNDARY:
1162     case OP_WORD_BOUNDARY:
1163     cc += _pcre_OP_lengths[*cc];
1164     break;
1165    
1166     /* Handle literal characters */
1167    
1168     case OP_CHAR:
1169     case OP_CHARNC:
1170 nigel 91 case OP_NOT:
1171 nigel 77 branchlength++;
1172     cc += 2;
1173     #ifdef SUPPORT_UTF8
1174     if ((options & PCRE_UTF8) != 0)
1175     {
1176     while ((*cc & 0xc0) == 0x80) cc++;
1177     }
1178     #endif
1179     break;
1180    
1181     /* Handle exact repetitions. The count is already in characters, but we
1182     need to skip over a multibyte character in UTF8 mode. */
1183    
1184     case OP_EXACT:
1185     branchlength += GET2(cc,1);
1186     cc += 4;
1187     #ifdef SUPPORT_UTF8
1188     if ((options & PCRE_UTF8) != 0)
1189     {
1190     while((*cc & 0x80) == 0x80) cc++;
1191     }
1192     #endif
1193     break;
1194    
1195     case OP_TYPEEXACT:
1196     branchlength += GET2(cc,1);
1197 ph10 218 if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1198 nigel 77 cc += 4;
1199     break;
1200    
1201     /* Handle single-char matchers */
1202    
1203     case OP_PROP:
1204     case OP_NOTPROP:
1205 nigel 87 cc += 2;
1206 nigel 77 /* Fall through */
1207    
1208     case OP_NOT_DIGIT:
1209     case OP_DIGIT:
1210     case OP_NOT_WHITESPACE:
1211     case OP_WHITESPACE:
1212     case OP_NOT_WORDCHAR:
1213     case OP_WORDCHAR:
1214     case OP_ANY:
1215     branchlength++;
1216     cc++;
1217     break;
1218    
1219     /* The single-byte matcher isn't allowed */
1220    
1221     case OP_ANYBYTE:
1222     return -2;
1223    
1224     /* Check a class for variable quantification */
1225    
1226     #ifdef SUPPORT_UTF8
1227     case OP_XCLASS:
1228     cc += GET(cc, 1) - 33;
1229     /* Fall through */
1230     #endif
1231    
1232     case OP_CLASS:
1233     case OP_NCLASS:
1234     cc += 33;
1235    
1236     switch (*cc)
1237     {
1238     case OP_CRSTAR:
1239     case OP_CRMINSTAR:
1240     case OP_CRQUERY:
1241     case OP_CRMINQUERY:
1242     return -1;
1243    
1244     case OP_CRRANGE:
1245     case OP_CRMINRANGE:
1246     if (GET2(cc,1) != GET2(cc,3)) return -1;
1247     branchlength += GET2(cc,1);
1248     cc += 5;
1249     break;
1250    
1251     default:
1252     branchlength++;
1253     }
1254     break;
1255    
1256     /* Anything else is variable length */
1257    
1258     default:
1259     return -1;
1260     }
1261     }
1262     /* Control never gets here */
1263     }
1264    
1265    
1266    
1267    
1268     /*************************************************
1269     * Scan compiled regex for numbered bracket *
1270     *************************************************/
1271    
1272     /* This little function scans through a compiled pattern until it finds a
1273     capturing bracket with the given number.
1274    
1275     Arguments:
1276     code points to start of expression
1277     utf8 TRUE in UTF-8 mode
1278     number the required bracket number
1279    
1280     Returns: pointer to the opcode for the bracket, or NULL if not found
1281     */
1282    
1283     static const uschar *
1284     find_bracket(const uschar *code, BOOL utf8, int number)
1285     {
1286     for (;;)
1287     {
1288     register int c = *code;
1289     if (c == OP_END) return NULL;
1290 nigel 91
1291     /* XCLASS is used for classes that cannot be represented just by a bit
1292     map. This includes negated single high-valued characters. The length in
1293     the table is zero; the actual length is stored in the compiled code. */
1294    
1295     if (c == OP_XCLASS) code += GET(code, 1);
1296    
1297 nigel 93 /* Handle capturing bracket */
1298 nigel 91
1299 nigel 93 else if (c == OP_CBRA)
1300 nigel 77 {
1301 nigel 93 int n = GET2(code, 1+LINK_SIZE);
1302 nigel 77 if (n == number) return (uschar *)code;
1303 nigel 93 code += _pcre_OP_lengths[c];
1304 nigel 77 }
1305 nigel 91
1306 ph10 218 /* Otherwise, we can get the item's length from the table, except that for
1307     repeated character types, we have to test for \p and \P, which have an extra
1308     two bytes of parameters. */
1309 nigel 91
1310 nigel 77 else
1311     {
1312 ph10 218 switch(c)
1313     {
1314     case OP_TYPESTAR:
1315     case OP_TYPEMINSTAR:
1316     case OP_TYPEPLUS:
1317     case OP_TYPEMINPLUS:
1318     case OP_TYPEQUERY:
1319     case OP_TYPEMINQUERY:
1320     case OP_TYPEUPTO:
1321     case OP_TYPEMINUPTO:
1322     case OP_TYPEEXACT:
1323     case OP_TYPEPOSSTAR:
1324     case OP_TYPEPOSPLUS:
1325     case OP_TYPEPOSQUERY:
1326     case OP_TYPEPOSUPTO:
1327     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1328     break;
1329     }
1330    
1331     /* Add in the fixed length from the table */
1332    
1333 nigel 77 code += _pcre_OP_lengths[c];
1334 ph10 218
1335     /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1336     a multi-byte character. The length in the table is a minimum, so we have to
1337     arrange to skip the extra bytes. */
1338    
1339 ph10 107 #ifdef SUPPORT_UTF8
1340 nigel 77 if (utf8) switch(c)
1341     {
1342     case OP_CHAR:
1343     case OP_CHARNC:
1344     case OP_EXACT:
1345     case OP_UPTO:
1346     case OP_MINUPTO:
1347 nigel 93 case OP_POSUPTO:
1348 nigel 77 case OP_STAR:
1349     case OP_MINSTAR:
1350 nigel 93 case OP_POSSTAR:
1351 nigel 77 case OP_PLUS:
1352     case OP_MINPLUS:
1353 nigel 93 case OP_POSPLUS:
1354 nigel 77 case OP_QUERY:
1355     case OP_MINQUERY:
1356 nigel 93 case OP_POSQUERY:
1357     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1358 nigel 77 break;
1359     }
1360 ph10 111 #endif
1361 nigel 77 }
1362     }
1363     }
1364    
1365    
1366    
1367     /*************************************************
1368     * Scan compiled regex for recursion reference *
1369     *************************************************/
1370    
1371     /* This little function scans through a compiled pattern until it finds an
1372     instance of OP_RECURSE.
1373    
1374     Arguments:
1375     code points to start of expression
1376     utf8 TRUE in UTF-8 mode
1377    
1378     Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1379     */
1380    
1381     static const uschar *
1382     find_recurse(const uschar *code, BOOL utf8)
1383     {
1384     for (;;)
1385     {
1386     register int c = *code;
1387     if (c == OP_END) return NULL;
1388 nigel 91 if (c == OP_RECURSE) return code;
1389 ph10 218
1390 nigel 91 /* XCLASS is used for classes that cannot be represented just by a bit
1391     map. This includes negated single high-valued characters. The length in
1392     the table is zero; the actual length is stored in the compiled code. */
1393    
1394     if (c == OP_XCLASS) code += GET(code, 1);
1395    
1396 ph10 218 /* Otherwise, we can get the item's length from the table, except that for
1397     repeated character types, we have to test for \p and \P, which have an extra
1398     two bytes of parameters. */
1399 nigel 91
1400 nigel 77 else
1401     {
1402 ph10 218 switch(c)
1403     {
1404     case OP_TYPESTAR:
1405     case OP_TYPEMINSTAR:
1406     case OP_TYPEPLUS:
1407     case OP_TYPEMINPLUS:
1408     case OP_TYPEQUERY:
1409     case OP_TYPEMINQUERY:
1410     case OP_TYPEUPTO:
1411     case OP_TYPEMINUPTO:
1412     case OP_TYPEEXACT:
1413     case OP_TYPEPOSSTAR:
1414     case OP_TYPEPOSPLUS:
1415     case OP_TYPEPOSQUERY:
1416     case OP_TYPEPOSUPTO:
1417     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1418     break;
1419     }
1420    
1421     /* Add in the fixed length from the table */
1422    
1423 nigel 77 code += _pcre_OP_lengths[c];
1424 ph10 218
1425     /* In UTF-8 mode, opcodes that are followed by a character may be followed
1426     by a multi-byte character. The length in the table is a minimum, so we have
1427     to arrange to skip the extra bytes. */
1428    
1429 ph10 107 #ifdef SUPPORT_UTF8
1430 nigel 77 if (utf8) switch(c)
1431     {
1432     case OP_CHAR:
1433     case OP_CHARNC:
1434     case OP_EXACT:
1435     case OP_UPTO:
1436     case OP_MINUPTO:
1437 nigel 93 case OP_POSUPTO:
1438 nigel 77 case OP_STAR:
1439     case OP_MINSTAR:
1440 nigel 93 case OP_POSSTAR:
1441 nigel 77 case OP_PLUS:
1442     case OP_MINPLUS:
1443 nigel 93 case OP_POSPLUS:
1444 nigel 77 case OP_QUERY:
1445     case OP_MINQUERY:
1446 nigel 93 case OP_POSQUERY:
1447     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1448 nigel 77 break;
1449     }
1450 ph10 111 #endif
1451 nigel 77 }
1452     }
1453     }
1454    
1455    
1456    
1457     /*************************************************
1458     * Scan compiled branch for non-emptiness *
1459     *************************************************/
1460    
1461     /* This function scans through a branch of a compiled pattern to see whether it
1462 nigel 93 can match the empty string or not. It is called from could_be_empty()
1463     below and from compile_branch() when checking for an unlimited repeat of a
1464     group that can match nothing. Note that first_significant_code() skips over
1465     assertions. If we hit an unclosed bracket, we return "empty" - this means we've
1466     struck an inner bracket whose current branch will already have been scanned.
1467 nigel 77
1468     Arguments:
1469     code points to start of search
1470     endcode points to where to stop
1471     utf8 TRUE if in UTF8 mode
1472    
1473     Returns: TRUE if what is matched could be empty
1474     */
1475    
1476     static BOOL
1477     could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1478     {
1479     register int c;
1480 nigel 93 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1481 nigel 77 code < endcode;
1482     code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1483     {
1484     const uschar *ccode;
1485    
1486     c = *code;
1487 ph10 172
1488 ph10 170 /* Groups with zero repeats can of course be empty; skip them. */
1489 nigel 77
1490 ph10 170 if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1491     {
1492 ph10 172 code += _pcre_OP_lengths[c];
1493 ph10 170 do code += GET(code, 1); while (*code == OP_ALT);
1494     c = *code;
1495     continue;
1496     }
1497    
1498     /* For other groups, scan the branches. */
1499 ph10 172
1500 ph10 206 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1501 nigel 77 {
1502     BOOL empty_branch;
1503     if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1504    
1505     /* Scan a closed bracket */
1506    
1507     empty_branch = FALSE;
1508     do
1509     {
1510     if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1511     empty_branch = TRUE;
1512     code += GET(code, 1);
1513     }
1514     while (*code == OP_ALT);
1515     if (!empty_branch) return FALSE; /* All branches are non-empty */
1516 ph10 172 c = *code;
1517 nigel 93 continue;
1518 nigel 77 }
1519    
1520 nigel 93 /* Handle the other opcodes */
1521    
1522     switch (c)
1523 nigel 77 {
1524 ph10 216 /* Check for quantifiers after a class. XCLASS is used for classes that
1525     cannot be represented just by a bit map. This includes negated single
1526     high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1527     actual length is stored in the compiled code, so we must update "code"
1528     here. */
1529 nigel 77
1530     #ifdef SUPPORT_UTF8
1531     case OP_XCLASS:
1532 ph10 216 ccode = code += GET(code, 1);
1533 nigel 77 goto CHECK_CLASS_REPEAT;
1534     #endif
1535    
1536     case OP_CLASS:
1537     case OP_NCLASS:
1538     ccode = code + 33;
1539    
1540     #ifdef SUPPORT_UTF8
1541     CHECK_CLASS_REPEAT:
1542     #endif
1543    
1544     switch (*ccode)
1545     {
1546     case OP_CRSTAR: /* These could be empty; continue */
1547     case OP_CRMINSTAR:
1548     case OP_CRQUERY:
1549     case OP_CRMINQUERY:
1550     break;
1551    
1552     default: /* Non-repeat => class must match */
1553     case OP_CRPLUS: /* These repeats aren't empty */
1554     case OP_CRMINPLUS:
1555     return FALSE;
1556    
1557     case OP_CRRANGE:
1558     case OP_CRMINRANGE:
1559     if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1560     break;
1561     }
1562     break;
1563    
1564     /* Opcodes that must match a character */
1565    
1566     case OP_PROP:
1567     case OP_NOTPROP:
1568     case OP_EXTUNI:
1569     case OP_NOT_DIGIT:
1570     case OP_DIGIT:
1571     case OP_NOT_WHITESPACE:
1572     case OP_WHITESPACE:
1573     case OP_NOT_WORDCHAR:
1574     case OP_WORDCHAR:
1575     case OP_ANY:
1576     case OP_ANYBYTE:
1577     case OP_CHAR:
1578     case OP_CHARNC:
1579     case OP_NOT:
1580     case OP_PLUS:
1581     case OP_MINPLUS:
1582 nigel 93 case OP_POSPLUS:
1583 nigel 77 case OP_EXACT:
1584     case OP_NOTPLUS:
1585     case OP_NOTMINPLUS:
1586 nigel 93 case OP_NOTPOSPLUS:
1587 nigel 77 case OP_NOTEXACT:
1588     case OP_TYPEPLUS:
1589     case OP_TYPEMINPLUS:
1590 nigel 93 case OP_TYPEPOSPLUS:
1591 nigel 77 case OP_TYPEEXACT:
1592     return FALSE;
1593    
1594     /* End of branch */
1595    
1596     case OP_KET:
1597     case OP_KETRMAX:
1598     case OP_KETRMIN:
1599     case OP_ALT:
1600     return TRUE;
1601    
1602 nigel 93 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1603     MINUPTO, and POSUPTO may be followed by a multibyte character */
1604 nigel 77
1605     #ifdef SUPPORT_UTF8
1606     case OP_STAR:
1607     case OP_MINSTAR:
1608 nigel 93 case OP_POSSTAR:
1609 nigel 77 case OP_QUERY:
1610     case OP_MINQUERY:
1611 nigel 93 case OP_POSQUERY:
1612 nigel 77 case OP_UPTO:
1613     case OP_MINUPTO:
1614 nigel 93 case OP_POSUPTO:
1615 nigel 77 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1616     break;
1617     #endif
1618     }
1619     }
1620    
1621     return TRUE;
1622     }
1623    
1624    
1625    
1626     /*************************************************
1627     * Scan compiled regex for non-emptiness *
1628     *************************************************/
1629    
1630     /* This function is called to check for left recursive calls. We want to check
1631     the current branch of the current pattern to see if it could match the empty
1632     string. If it could, we must look outwards for branches at other levels,
1633     stopping when we pass beyond the bracket which is the subject of the recursion.
1634    
1635     Arguments:
1636     code points to start of the recursion
1637     endcode points to where to stop (current RECURSE item)
1638     bcptr points to the chain of current (unclosed) branch starts
1639     utf8 TRUE if in UTF-8 mode
1640    
1641     Returns: TRUE if what is matched could be empty
1642     */
1643    
1644     static BOOL
1645     could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1646     BOOL utf8)
1647     {
1648     while (bcptr != NULL && bcptr->current >= code)
1649     {
1650     if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1651     bcptr = bcptr->outer;
1652     }
1653     return TRUE;
1654     }
1655    
1656    
1657    
1658     /*************************************************
1659     * Check for POSIX class syntax *
1660     *************************************************/
1661    
1662     /* This function is called when the sequence "[:" or "[." or "[=" is
1663     encountered in a character class. It checks whether this is followed by an
1664     optional ^ and then a sequence of letters, terminated by a matching ":]" or
1665     ".]" or "=]".
1666    
1667     Argument:
1668     ptr pointer to the initial [
1669     endptr where to return the end pointer
1670     cd pointer to compile data
1671    
1672     Returns: TRUE or FALSE
1673     */
1674    
1675     static BOOL
1676     check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1677     {
1678     int terminator; /* Don't combine these lines; the Solaris cc */
1679     terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1680     if (*(++ptr) == '^') ptr++;
1681     while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1682     if (*ptr == terminator && ptr[1] == ']')
1683     {
1684     *endptr = ptr;
1685     return TRUE;
1686     }
1687     return FALSE;
1688     }
1689    
1690    
1691    
1692    
1693     /*************************************************
1694     * Check POSIX class name *
1695     *************************************************/
1696    
1697     /* This function is called to check the name given in a POSIX-style class entry
1698     such as [:alnum:].
1699    
1700     Arguments:
1701     ptr points to the first letter
1702     len the length of the name
1703    
1704     Returns: a value representing the name, or -1 if unknown
1705     */
1706    
1707     static int
1708     check_posix_name(const uschar *ptr, int len)
1709     {
1710     register int yield = 0;
1711     while (posix_name_lengths[yield] != 0)
1712     {
1713     if (len == posix_name_lengths[yield] &&
1714     strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1715     yield++;
1716     }
1717     return -1;
1718     }
1719    
1720    
1721     /*************************************************
1722     * Adjust OP_RECURSE items in repeated group *
1723     *************************************************/
1724    
1725     /* OP_RECURSE items contain an offset from the start of the regex to the group
1726     that is referenced. This means that groups can be replicated for fixed
1727     repetition simply by copying (because the recursion is allowed to refer to
1728     earlier groups that are outside the current group). However, when a group is
1729     optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1730     it, after it has been compiled. This means that any OP_RECURSE items within it
1731     that refer to the group itself or any contained groups have to have their
1732 nigel 93 offsets adjusted. That one of the jobs of this function. Before it is called,
1733     the partially compiled regex must be temporarily terminated with OP_END.
1734 nigel 77
1735 nigel 93 This function has been extended with the possibility of forward references for
1736     recursions and subroutine calls. It must also check the list of such references
1737     for the group we are dealing with. If it finds that one of the recursions in
1738     the current group is on this list, it adjusts the offset in the list, not the
1739     value in the reference (which is a group number).
1740    
1741 nigel 77 Arguments:
1742     group points to the start of the group
1743     adjust the amount by which the group is to be moved
1744     utf8 TRUE in UTF-8 mode
1745     cd contains pointers to tables etc.
1746 nigel 93 save_hwm the hwm forward reference pointer at the start of the group
1747 nigel 77
1748     Returns: nothing
1749     */
1750    
1751     static void
1752 nigel 93 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1753     uschar *save_hwm)
1754 nigel 77 {
1755     uschar *ptr = group;
1756     while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1757     {
1758 nigel 93 int offset;
1759     uschar *hc;
1760    
1761     /* See if this recursion is on the forward reference list. If so, adjust the
1762     reference. */
1763    
1764     for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1765     {
1766     offset = GET(hc, 0);
1767     if (cd->start_code + offset == ptr + 1)
1768     {
1769     PUT(hc, 0, offset + adjust);
1770     break;
1771     }
1772     }
1773    
1774     /* Otherwise, adjust the recursion offset if it's after the start of this
1775     group. */
1776    
1777     if (hc >= cd->hwm)
1778     {
1779     offset = GET(ptr, 1);
1780     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1781     }
1782    
1783 nigel 77 ptr += 1 + LINK_SIZE;
1784     }
1785     }
1786    
1787    
1788    
1789     /*************************************************
1790     * Insert an automatic callout point *
1791     *************************************************/
1792    
1793     /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1794     callout points before each pattern item.
1795    
1796     Arguments:
1797     code current code pointer
1798     ptr current pattern pointer
1799     cd pointers to tables etc
1800    
1801     Returns: new code pointer
1802     */
1803    
1804     static uschar *
1805     auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1806     {
1807     *code++ = OP_CALLOUT;
1808     *code++ = 255;
1809     PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1810     PUT(code, LINK_SIZE, 0); /* Default length */
1811     return code + 2*LINK_SIZE;
1812     }
1813    
1814    
1815    
1816     /*************************************************
1817     * Complete a callout item *
1818     *************************************************/
1819    
1820     /* A callout item contains the length of the next item in the pattern, which
1821     we can't fill in till after we have reached the relevant point. This is used
1822     for both automatic and manual callouts.
1823    
1824     Arguments:
1825     previous_callout points to previous callout item
1826     ptr current pattern pointer
1827     cd pointers to tables etc
1828    
1829     Returns: nothing
1830     */
1831    
1832     static void
1833     complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1834     {
1835     int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1836     PUT(previous_callout, 2 + LINK_SIZE, length);
1837     }
1838    
1839    
1840    
#ifdef SUPPORT_UCP
/*************************************************
*           Get othercase range                  *
*************************************************/

/* Given the start and end of a class range (UTF-8 mode with UCP support), this
function scans upwards for the next run of characters whose "other case"
values are themselves consecutive. Each call hands back one such run and
advances the start value, so the caller can call repeatedly until FALSE is
returned.

Arguments:
  cptr        points to starting character value; updated
  d           end value
  ocptr       where to put start of othercase range
  odptr       where to put end of othercase range

Yield:        TRUE when range returned; FALSE when no more
*/

static BOOL
get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
  unsigned int *odptr)
{
unsigned int ch = *cptr;
unsigned int first_oc = NOTACHAR;
unsigned int expected;

/* Find the first character in the range that has an other case. */

while (ch <= d)
  {
  first_oc = _pcre_ucp_othercase(ch);
  if (first_oc != NOTACHAR) break;
  ch++;
  }

/* None left; the start value is not updated in this case. */

if (ch > d) return FALSE;

*ocptr = first_oc;
expected = first_oc + 1;

/* Extend the run for as long as the other-case values stay consecutive. */

ch++;
while (ch <= d && _pcre_ucp_othercase(ch) == expected)
  {
  expected++;
  ch++;
  }

*odptr = expected - 1;
*cptr = ch;

return TRUE;
}
#endif  /* SUPPORT_UCP */
1886    
1887    
1888 nigel 93
1889 nigel 77 /*************************************************
1890 nigel 93 * Check if auto-possessifying is possible *
1891     *************************************************/
1892    
1893     /* This function is called for unlimited repeats of certain items, to see
1894     whether the next thing could possibly match the repeated item. If not, it makes
1895     sense to automatically possessify the repeated item.
1896    
1897     Arguments:
1898     op_code the repeated op code
1899     this data for this item, depends on the opcode
1900     utf8 TRUE in UTF-8 mode
1901     utf8_char used for utf8 character bytes, NULL if not relevant
1902     ptr next character in pattern
1903     options options bits
1904     cd contains pointers to tables etc.
1905    
1906     Returns: TRUE if possessifying is wanted
1907     */
1908    
1909     static BOOL
1910     check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
1911     const uschar *ptr, int options, compile_data *cd)
1912     {
1913     int next;
1914    
1915     /* Skip whitespace and comments in extended mode */
1916    
1917     if ((options & PCRE_EXTENDED) != 0)
1918     {
1919     for (;;)
1920     {
1921     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1922     if (*ptr == '#')
1923     {
1924     while (*(++ptr) != 0)
1925     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1926     }
1927     else break;
1928     }
1929     }
1930    
1931     /* If the next item is one that we can handle, get its value. A non-negative
1932     value is a character, a negative value is an escape value. */
1933    
1934     if (*ptr == '\\')
1935     {
1936     int temperrorcode = 0;
1937     next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
1938     if (temperrorcode != 0) return FALSE;
1939     ptr++; /* Point after the escape sequence */
1940     }
1941    
1942     else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
1943     {
1944     #ifdef SUPPORT_UTF8
1945     if (utf8) { GETCHARINC(next, ptr); } else
1946     #endif
1947     next = *ptr++;
1948     }
1949    
1950     else return FALSE;
1951    
1952     /* Skip whitespace and comments in extended mode */
1953    
1954     if ((options & PCRE_EXTENDED) != 0)
1955     {
1956     for (;;)
1957     {
1958     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1959     if (*ptr == '#')
1960     {
1961     while (*(++ptr) != 0)
1962     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1963     }
1964     else break;
1965     }
1966     }
1967    
1968     /* If the next thing is itself optional, we have to give up. */
1969    
1970     if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
1971     return FALSE;
1972    
1973     /* Now compare the next item with the previous opcode. If the previous is a
1974     positive single character match, "item" either contains the character or, if
1975     "item" is greater than 127 in utf8 mode, the character's bytes are in
1976     utf8_char. */
1977    
1978    
1979     /* Handle cases when the next item is a character. */
1980    
1981     if (next >= 0) switch(op_code)
1982     {
1983     case OP_CHAR:
1984     #ifdef SUPPORT_UTF8
1985     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1986     #endif
1987     return item != next;
1988    
1989     /* For CHARNC (caseless character) we must check the other case. If we have
1990     Unicode property support, we can use it to test the other case of
1991     high-valued characters. */
1992    
1993     case OP_CHARNC:
1994     #ifdef SUPPORT_UTF8
1995     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1996     #endif
1997     if (item == next) return FALSE;
1998     #ifdef SUPPORT_UTF8
1999     if (utf8)
2000     {
2001     unsigned int othercase;
2002     if (next < 128) othercase = cd->fcc[next]; else
2003     #ifdef SUPPORT_UCP
2004     othercase = _pcre_ucp_othercase((unsigned int)next);
2005     #else
2006     othercase = NOTACHAR;
2007     #endif
2008     return (unsigned int)item != othercase;
2009     }
2010     else
2011     #endif /* SUPPORT_UTF8 */
2012     return (item != cd->fcc[next]); /* Non-UTF-8 mode */
2013    
2014     /* For OP_NOT, "item" must be a single-byte character. */
2015    
2016     case OP_NOT:
2017     if (next < 0) return FALSE; /* Not a character */
2018     if (item == next) return TRUE;
2019     if ((options & PCRE_CASELESS) == 0) return FALSE;
2020     #ifdef SUPPORT_UTF8
2021     if (utf8)
2022     {
2023     unsigned int othercase;
2024     if (next < 128) othercase = cd->fcc[next]; else
2025     #ifdef SUPPORT_UCP
2026     othercase = _pcre_ucp_othercase(next);
2027     #else
2028     othercase = NOTACHAR;
2029     #endif
2030     return (unsigned int)item == othercase;
2031     }
2032     else
2033     #endif /* SUPPORT_UTF8 */
2034     return (item == cd->fcc[next]); /* Non-UTF-8 mode */
2035    
2036     case OP_DIGIT:
2037     return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2038    
2039     case OP_NOT_DIGIT:
2040     return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2041    
2042     case OP_WHITESPACE:
2043     return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2044    
2045     case OP_NOT_WHITESPACE:
2046     return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2047    
2048     case OP_WORDCHAR:
2049     return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2050    
2051     case OP_NOT_WORDCHAR:
2052     return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2053    
2054 ph10 180 case OP_HSPACE:
2055     case OP_NOT_HSPACE:
2056     switch(next)
2057     {
2058     case 0x09:
2059     case 0x20:
2060     case 0xa0:
2061     case 0x1680:
2062     case 0x180e:
2063     case 0x2000:
2064     case 0x2001:
2065     case 0x2002:
2066     case 0x2003:
2067     case 0x2004:
2068     case 0x2005:
2069     case 0x2006:
2070     case 0x2007:
2071     case 0x2008:
2072     case 0x2009:
2073     case 0x200A:
2074     case 0x202f:
2075     case 0x205f:
2076     case 0x3000:
2077     return op_code != OP_HSPACE;
2078     default:
2079     return op_code == OP_HSPACE;
2080     }
2081    
2082     case OP_VSPACE:
2083     case OP_NOT_VSPACE:
2084     switch(next)
2085     {
2086     case 0x0a:
2087     case 0x0b:
2088     case 0x0c:
2089     case 0x0d:
2090     case 0x85:
2091     case 0x2028:
2092     case 0x2029:
2093     return op_code != OP_VSPACE;
2094     default:
2095     return op_code == OP_VSPACE;
2096     }
2097    
2098 nigel 93 default:
2099     return FALSE;
2100     }
2101    
2102    
2103     /* Handle the case when the next item is \d, \s, etc. */
2104    
2105     switch(op_code)
2106     {
2107     case OP_CHAR:
2108     case OP_CHARNC:
2109     #ifdef SUPPORT_UTF8
2110     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2111     #endif
2112     switch(-next)
2113     {
2114     case ESC_d:
2115     return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2116    
2117     case ESC_D:
2118     return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2119    
2120     case ESC_s:
2121     return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2122    
2123     case ESC_S:
2124     return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2125    
2126     case ESC_w:
2127     return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2128    
2129     case ESC_W:
2130     return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2131 ph10 182
2132 ph10 180 case ESC_h:
2133     case ESC_H:
2134     switch(item)
2135     {
2136     case 0x09:
2137     case 0x20:
2138     case 0xa0:
2139     case 0x1680:
2140     case 0x180e:
2141     case 0x2000:
2142     case 0x2001:
2143     case 0x2002:
2144     case 0x2003:
2145     case 0x2004:
2146     case 0x2005:
2147     case 0x2006:
2148     case 0x2007:
2149     case 0x2008:
2150     case 0x2009:
2151     case 0x200A:
2152     case 0x202f:
2153     case 0x205f:
2154     case 0x3000:
2155     return -next != ESC_h;
2156     default:
2157     return -next == ESC_h;
2158 ph10 182 }
2159    
2160 ph10 180 case ESC_v:
2161     case ESC_V:
2162     switch(item)
2163     {
2164     case 0x0a:
2165     case 0x0b:
2166     case 0x0c:
2167     case 0x0d:
2168     case 0x85:
2169     case 0x2028:
2170     case 0x2029:
2171     return -next != ESC_v;
2172     default:
2173     return -next == ESC_v;
2174 ph10 182 }
2175 nigel 93
2176     default:
2177     return FALSE;
2178     }
2179    
2180     case OP_DIGIT:
2181 ph10 180 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2182     next == -ESC_h || next == -ESC_v;
2183 nigel 93
2184     case OP_NOT_DIGIT:
2185     return next == -ESC_d;
2186    
2187     case OP_WHITESPACE:
2188     return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2189    
2190     case OP_NOT_WHITESPACE:
2191 ph10 180 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2192 nigel 93
2193 ph10 180 case OP_HSPACE:
2194     return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2195    
2196     case OP_NOT_HSPACE:
2197     return next == -ESC_h;
2198 ph10 182
2199 ph10 180 /* Can't have \S in here because VT matches \S (Perl anomaly) */
2200 ph10 182 case OP_VSPACE:
2201 ph10 180 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2202    
2203     case OP_NOT_VSPACE:
2204 ph10 182 return next == -ESC_v;
2205 ph10 180
2206 nigel 93 case OP_WORDCHAR:
2207 ph10 180 return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2208 nigel 93
2209     case OP_NOT_WORDCHAR:
2210     return next == -ESC_w || next == -ESC_d;
2211 ph10 182
2212 nigel 93 default:
2213     return FALSE;
2214     }
2215    
2216     /* Control does not reach here */
2217     }
2218    
2219    
2220    
2221     /*************************************************
2222 nigel 77 * Compile one branch *
2223     *************************************************/
2224    
2225 nigel 93 /* Scan the pattern, compiling it into a vector. If the options are
2226 nigel 77 changed during the branch, the pointer is used to change the external options
2227 nigel 93 bits. This function is used during the pre-compile phase when we are trying
2228     to find out the amount of memory needed, as well as during the real compile
2229     phase. The value of lengthptr distinguishes the two phases.
2230 nigel 77
2231     Arguments:
2232     optionsptr pointer to the option bits
2233     codeptr points to the pointer to the current code point
2234     ptrptr points to the current pattern pointer
2235     errorcodeptr points to error code variable
2236     firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2237     reqbyteptr set to the last literal character required, else < 0
2238     bcptr points to current branch chain
2239     cd contains pointers to tables etc.
2240 nigel 93 lengthptr NULL during the real compile phase
2241     points to length accumulator during pre-compile phase
2242 nigel 77
2243     Returns: TRUE on success
2244     FALSE, with *errorcodeptr set non-zero on error
2245     */
2246    
2247     static BOOL
2248 nigel 93 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2249     int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2250     compile_data *cd, int *lengthptr)
2251 nigel 77 {
2252     int repeat_type, op_type;
2253     int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2254     int bravalue = 0;
2255     int greedy_default, greedy_non_default;
2256     int firstbyte, reqbyte;
2257     int zeroreqbyte, zerofirstbyte;
2258     int req_caseopt, reqvary, tempreqvary;
2259     int options = *optionsptr;
2260     int after_manual_callout = 0;
2261 nigel 93 int length_prevgroup = 0;
2262 nigel 77 register int c;
2263     register uschar *code = *codeptr;
2264 nigel 93 uschar *last_code = code;
2265     uschar *orig_code = code;
2266 nigel 77 uschar *tempcode;
2267     BOOL inescq = FALSE;
2268     BOOL groupsetfirstbyte = FALSE;
2269     const uschar *ptr = *ptrptr;
2270     const uschar *tempptr;
2271     uschar *previous = NULL;
2272     uschar *previous_callout = NULL;
2273 nigel 93 uschar *save_hwm = NULL;
2274 nigel 77 uschar classbits[32];
2275    
2276     #ifdef SUPPORT_UTF8
2277     BOOL class_utf8;
2278     BOOL utf8 = (options & PCRE_UTF8) != 0;
2279     uschar *class_utf8data;
2280     uschar utf8_char[6];
2281     #else
2282     BOOL utf8 = FALSE;
2283 nigel 93 uschar *utf8_char = NULL;
2284 nigel 77 #endif
2285    
2286 nigel 93 #ifdef DEBUG
2287     if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2288     #endif
2289    
2290 nigel 77 /* Set up the default and non-default settings for greediness */
2291    
2292     greedy_default = ((options & PCRE_UNGREEDY) != 0);
2293     greedy_non_default = greedy_default ^ 1;
2294    
2295     /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2296     matching encountered yet". It gets changed to REQ_NONE if we hit something that
2297     matches a non-fixed char first char; reqbyte just remains unset if we never
2298     find one.
2299    
2300     When we hit a repeat whose minimum is zero, we may have to adjust these values
2301     to take the zero repeat into account. This is implemented by setting them to
2302     zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2303     item types that can be repeated set these backoff variables appropriately. */
2304    
2305     firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2306    
2307     /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2308     according to the current setting of the caseless flag. REQ_CASELESS is a bit
2309     value > 255. It is added into the firstbyte or reqbyte variables to record the
2310     case status of the value. This is used only for ASCII characters. */
2311    
2312     req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2313    
2314     /* Switch on next character until the end of the branch */
2315    
2316     for (;; ptr++)
2317     {
2318     BOOL negate_class;
2319     BOOL possessive_quantifier;
2320     BOOL is_quantifier;
2321 nigel 93 BOOL is_recurse;
2322 ph10 180 BOOL reset_bracount;
2323 nigel 77 int class_charcount;
2324     int class_lastchar;
2325     int newoptions;
2326     int recno;
2327 ph10 172 int refsign;
2328 nigel 77 int skipbytes;
2329     int subreqbyte;
2330     int subfirstbyte;
2331 nigel 93 int terminator;
2332 nigel 77 int mclength;
2333     uschar mcbuffer[8];
2334    
2335 nigel 93 /* Get next byte in the pattern */
2336 nigel 77
2337     c = *ptr;
2338    
2339 nigel 93 /* If we are in the pre-compile phase, accumulate the length used for the
2340     previous cycle of this loop. */
2341    
2342     if (lengthptr != NULL)
2343     {
2344     #ifdef DEBUG
2345     if (code > cd->hwm) cd->hwm = code; /* High water info */
2346     #endif
2347     if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2348     {
2349     *errorcodeptr = ERR52;
2350     goto FAILED;
2351     }
2352    
2353     /* There is at least one situation where code goes backwards: this is the
2354     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2355     the class is simply eliminated. However, it is created first, so we have to
2356     allow memory for it. Therefore, don't ever reduce the length at this point.
2357     */
2358    
2359     if (code < last_code) code = last_code;
2360 ph10 202
2361     /* Paranoid check for integer overflow */
2362    
2363     if (OFLOW_MAX - *lengthptr < code - last_code)
2364     {
2365     *errorcodeptr = ERR20;
2366     goto FAILED;
2367     }
2368    
2369 nigel 93 *lengthptr += code - last_code;
2370     DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2371    
2372     /* If "previous" is set and it is not at the start of the work space, move
2373     it back to there, in order to avoid filling up the work space. Otherwise,
2374     if "previous" is NULL, reset the current code pointer to the start. */
2375    
2376     if (previous != NULL)
2377     {
2378     if (previous > orig_code)
2379     {
2380     memmove(orig_code, previous, code - previous);
2381     code -= previous - orig_code;
2382     previous = orig_code;
2383     }
2384     }
2385     else code = orig_code;
2386    
2387     /* Remember where this code item starts so we can pick up the length
2388     next time round. */
2389    
2390     last_code = code;
2391     }
2392    
2393     /* In the real compile phase, just check the workspace used by the forward
2394     reference list. */
2395    
2396     else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2397     {
2398     *errorcodeptr = ERR52;
2399     goto FAILED;
2400     }
2401    
2402 nigel 77 /* If in \Q...\E, check for the end; if not, we have a literal */
2403    
2404     if (inescq && c != 0)
2405     {
2406     if (c == '\\' && ptr[1] == 'E')
2407     {
2408     inescq = FALSE;
2409     ptr++;
2410     continue;
2411     }
2412     else
2413     {
2414     if (previous_callout != NULL)
2415     {
2416 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2417     complete_callout(previous_callout, ptr, cd);
2418 nigel 77 previous_callout = NULL;
2419     }
2420     if ((options & PCRE_AUTO_CALLOUT) != 0)
2421     {
2422     previous_callout = code;
2423     code = auto_callout(code, ptr, cd);
2424     }
2425     goto NORMAL_CHAR;
2426     }
2427     }
2428    
2429     /* Fill in length of a previous callout, except when the next thing is
2430     a quantifier. */
2431    
2432     is_quantifier = c == '*' || c == '+' || c == '?' ||
2433     (c == '{' && is_counted_repeat(ptr+1));
2434    
2435     if (!is_quantifier && previous_callout != NULL &&
2436     after_manual_callout-- <= 0)
2437     {
2438 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2439     complete_callout(previous_callout, ptr, cd);
2440 nigel 77 previous_callout = NULL;
2441     }
2442    
2443     /* In extended mode, skip white space and comments */
2444    
2445     if ((options & PCRE_EXTENDED) != 0)
2446     {
2447     if ((cd->ctypes[c] & ctype_space) != 0) continue;
2448     if (c == '#')
2449     {
2450 nigel 93 while (*(++ptr) != 0)
2451 nigel 91 {
2452 nigel 93 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2453 nigel 91 }
2454 nigel 93 if (*ptr != 0) continue;
2455    
2456 nigel 91 /* Else fall through to handle end of string */
2457     c = 0;
2458 nigel 77 }
2459     }
2460    
2461     /* No auto callout for quantifiers. */
2462    
2463     if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2464     {
2465     previous_callout = code;
2466     code = auto_callout(code, ptr, cd);
2467     }
2468    
2469     switch(c)
2470     {
2471 nigel 93 /* ===================================================================*/
2472     case 0: /* The branch terminates at string end */
2473     case '|': /* or | or ) */
2474 nigel 77 case ')':
2475     *firstbyteptr = firstbyte;
2476     *reqbyteptr = reqbyte;
2477     *codeptr = code;
2478     *ptrptr = ptr;
2479 nigel 93 if (lengthptr != NULL)
2480     {
2481 ph10 202 if (OFLOW_MAX - *lengthptr < code - last_code)
2482     {
2483     *errorcodeptr = ERR20;
2484     goto FAILED;
2485     }
2486 nigel 93 *lengthptr += code - last_code; /* To include callout length */
2487     DPRINTF((">> end branch\n"));
2488     }
2489 nigel 77 return TRUE;
2490    
2491 nigel 93
2492     /* ===================================================================*/
2493 nigel 77 /* Handle single-character metacharacters. In multiline mode, ^ disables
2494     the setting of any following char as a first character. */
2495    
2496     case '^':
2497     if ((options & PCRE_MULTILINE) != 0)
2498     {
2499     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2500     }
2501     previous = NULL;
2502     *code++ = OP_CIRC;
2503     break;
2504    
2505     case '$':
2506     previous = NULL;
2507     *code++ = OP_DOLL;
2508     break;
2509    
2510     /* There can never be a first char if '.' is first, whatever happens about
2511     repeats. The value of reqbyte doesn't change either. */
2512    
2513     case '.':
2514     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2515     zerofirstbyte = firstbyte;
2516     zeroreqbyte = reqbyte;
2517     previous = code;
2518     *code++ = OP_ANY;
2519     break;
2520    
2521 nigel 93
2522     /* ===================================================================*/
2523 nigel 87 /* Character classes. If the included characters are all < 256, we build a
2524     32-byte bitmap of the permitted characters, except in the special case
2525     where there is only one such character. For negated classes, we build the
2526     map as usual, then invert it at the end. However, we use a different opcode
2527     so that data characters > 255 can be handled correctly.
2528 nigel 77
2529     If the class contains characters outside the 0-255 range, a different
2530     opcode is compiled. It may optionally have a bit map for characters < 256,
2531     but those above are are explicitly listed afterwards. A flag byte tells
2532     whether the bitmap is present, and whether this is a negated class or not.
2533     */
2534    
2535     case '[':
2536     previous = code;
2537    
2538     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2539     they are encountered at the top level, so we'll do that too. */
2540    
2541     if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2542     check_posix_syntax(ptr, &tempptr, cd))
2543     {
2544     *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2545     goto FAILED;
2546     }
2547    
2548 ph10 205 /* If the first character is '^', set the negation flag and skip it. Also,
2549 ph10 208 if the first few characters (either before or after ^) are \Q\E or \E we
2550 ph10 205 skip them too. This makes for compatibility with Perl. */
2551 ph10 208
2552 ph10 205 negate_class = FALSE;
2553     for (;;)
2554 nigel 77 {
2555     c = *(++ptr);
2556 ph10 205 if (c == '\\')
2557     {
2558 ph10 208 if (ptr[1] == 'E') ptr++;
2559 ph10 205 else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
2560 ph10 208 else break;
2561 ph10 205 }
2562     else if (!negate_class && c == '^')
2563     negate_class = TRUE;
2564     else break;
2565 ph10 208 }
2566 nigel 77
2567     /* Keep a count of chars with values < 256 so that we can optimize the case
2568 nigel 93 of just a single character (as long as it's < 256). However, For higher
2569     valued UTF-8 characters, we don't yet do any optimization. */
2570 nigel 77
2571     class_charcount = 0;
2572     class_lastchar = -1;
2573    
2574 nigel 93 /* Initialize the 32-char bit map to all zeros. We build the map in a
2575     temporary bit of memory, in case the class contains only 1 character (less
2576     than 256), because in that case the compiled code doesn't use the bit map.
2577     */
2578    
2579     memset(classbits, 0, 32 * sizeof(uschar));
2580    
2581 nigel 77 #ifdef SUPPORT_UTF8
2582     class_utf8 = FALSE; /* No chars >= 256 */
2583 nigel 93 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
2584 nigel 77 #endif
2585    
2586     /* Process characters until ] is reached. By writing this as a "do" it
2587 nigel 93 means that an initial ] is taken as a data character. At the start of the
2588     loop, c contains the first byte of the character. */
2589 nigel 77
2590 nigel 93 if (c != 0) do
2591 nigel 77 {
2592 nigel 93 const uschar *oldptr;
2593    
2594 nigel 77 #ifdef SUPPORT_UTF8
2595     if (utf8 && c > 127)
2596     { /* Braces are required because the */
2597     GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2598     }
2599     #endif
2600    
2601     /* Inside \Q...\E everything is literal except \E */
2602    
2603     if (inescq)
2604     {
2605 nigel 93 if (c == '\\' && ptr[1] == 'E') /* If we are at \E */
2606 nigel 77 {
2607 nigel 93 inescq = FALSE; /* Reset literal state */
2608     ptr++; /* Skip the 'E' */
2609     continue; /* Carry on with next */
2610 nigel 77 }
2611 nigel 93 goto CHECK_RANGE; /* Could be range if \E follows */
2612 nigel 77 }
2613    
2614     /* Handle POSIX class names. Perl allows a negation extension of the
2615     form [:^name:]. A square bracket that doesn't match the syntax is
2616     treated as a literal. We also recognize the POSIX constructions
2617     [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2618     5.6 and 5.8 do. */
2619    
2620     if (c == '[' &&
2621     (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2622     check_posix_syntax(ptr, &tempptr, cd))
2623     {
2624     BOOL local_negate = FALSE;
2625 nigel 87 int posix_class, taboffset, tabopt;
2626 nigel 77 register const uschar *cbits = cd->cbits;
2627 nigel 87 uschar pbits[32];
2628 nigel 77
2629     if (ptr[1] != ':')
2630     {
2631     *errorcodeptr = ERR31;
2632     goto FAILED;
2633     }
2634    
2635     ptr += 2;
2636     if (*ptr == '^')
2637     {
2638     local_negate = TRUE;
2639     ptr++;
2640     }
2641    
2642     posix_class = check_posix_name(ptr, tempptr - ptr);
2643     if (posix_class < 0)
2644     {
2645     *errorcodeptr = ERR30;
2646     goto FAILED;
2647     }
2648    
2649     /* If matching is caseless, upper and lower are converted to
2650     alpha. This relies on the fact that the class table starts with
2651     alpha, lower, upper as the first 3 entries. */
2652    
2653     if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2654     posix_class = 0;
2655    
2656 nigel 87 /* We build the bit map for the POSIX class in a chunk of local store
2657     because we may be adding and subtracting from it, and we don't want to
2658     subtract bits that may be in the main map already. At the end we or the
2659     result into the bit map that is being built. */
2660 nigel 77
2661     posix_class *= 3;
2662 nigel 87
2663     /* Copy in the first table (always present) */
2664    
2665     memcpy(pbits, cbits + posix_class_maps[posix_class],
2666     32 * sizeof(uschar));
2667    
2668     /* If there is a second table, add or remove it as required. */
2669    
2670     taboffset = posix_class_maps[posix_class + 1];
2671     tabopt = posix_class_maps[posix_class + 2];
2672    
2673     if (taboffset >= 0)
2674 nigel 77 {
2675 nigel 87 if (tabopt >= 0)
2676     for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
2677 nigel 77 else
2678 nigel 87 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
2679 nigel 77 }
2680    
2681 nigel 87 /* Not see if we need to remove any special characters. An option
2682     value of 1 removes vertical space and 2 removes underscore. */
2683    
2684     if (tabopt < 0) tabopt = -tabopt;
2685     if (tabopt == 1) pbits[1] &= ~0x3c;
2686     else if (tabopt == 2) pbits[11] &= 0x7f;
2687    
2688     /* Add the POSIX table or its complement into the main table that is
2689     being built and we are done. */
2690    
2691     if (local_negate)
2692     for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2693     else
2694     for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2695    
2696 nigel 77 ptr = tempptr + 1;
2697     class_charcount = 10; /* Set > 1; assumes more than 1 per class */
2698     continue; /* End of POSIX syntax handling */
2699     }
2700    
2701     /* Backslash may introduce a single character, or it may introduce one
2702 nigel 93 of the specials, which just set a flag. The sequence \b is a special
2703     case. Inside a class (and only there) it is treated as backspace.
2704     Elsewhere it marks a word boundary. Other escapes have preset maps ready
2705 ph10 205 to 'or' into the one we are building. We assume they have more than one
2706 nigel 77 character in them, so set class_charcount bigger than one. */
2707    
2708     if (c == '\\')
2709     {
2710 nigel 93 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2711     if (*errorcodeptr != 0) goto FAILED;
2712 nigel 77
2713     if (-c == ESC_b) c = '\b'; /* \b is backspace in a class */
2714     else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
2715 nigel 93 else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */
2716 nigel 77 else if (-c == ESC_Q) /* Handle start of quoted string */
2717     {
2718     if (ptr[1] == '\\' && ptr[2] == 'E')
2719     {
2720     ptr += 2; /* avoid empty string */
2721     }
2722     else inescq = TRUE;
2723     continue;
2724     }
2725 ph10 218 else if (-c == ESC_E) continue; /* Ignore orphan \E */
2726 nigel 77
2727     if (c < 0)
2728     {
2729     register const uschar *cbits = cd->cbits;
2730     class_charcount += 2; /* Greater than 1 is what matters */
2731 nigel 93
2732     /* Save time by not doing this in the pre-compile phase. */
2733    
2734     if (lengthptr == NULL) switch (-c)
2735 nigel 77 {
2736     case ESC_d:
2737     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2738     continue;
2739    
2740     case ESC_D:
2741     for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2742     continue;
2743    
2744     case ESC_w:
2745     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2746     continue;
2747    
2748     case ESC_W:
2749     for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2750     continue;
2751    
2752     case ESC_s:
2753     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2754     classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2755     continue;
2756    
2757     case ESC_S:
2758     for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2759     classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2760     continue;
2761    
2762 nigel 93 case ESC_E: /* Perl ignores an orphan \E */
2763     continue;
2764 ph10 180
2765 nigel 93 default: /* Not recognized; fall through */
2766     break; /* Need "default" setting to stop compiler warning. */
2767     }
2768    
2769     /* In the pre-compile phase, just do the recognition. */
2770    
2771     else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2772     c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2773 ph10 180
2774 ph10 178 /* We need to deal with \H, \h, \V, and \v in both phases because
2775     they use extra memory. */
2776 ph10 180
2777 ph10 178 if (-c == ESC_h)
2778     {
2779     SETBIT(classbits, 0x09); /* HT */
2780     SETBIT(classbits, 0x20); /* SPACE */
2781 ph10 180 SETBIT(classbits, 0xa0); /* NBSP */
2782 ph10 178 #ifdef SUPPORT_UTF8
2783     if (utf8)
2784 ph10 180 {
2785 ph10 178 class_utf8 = TRUE;
2786     *class_utf8data++ = XCL_SINGLE;
2787 ph10 180 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2788 ph10 178 *class_utf8data++ = XCL_SINGLE;
2789 ph10 180 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2790     *class_utf8data++ = XCL_RANGE;
2791     class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2792     class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2793 ph10 178 *class_utf8data++ = XCL_SINGLE;
2794 ph10 180 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2795 ph10 178 *class_utf8data++ = XCL_SINGLE;
2796 ph10 180 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2797 ph10 178 *class_utf8data++ = XCL_SINGLE;
2798 ph10 180 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
2799     }
2800     #endif
2801     continue;
2802     }
2803 nigel 93
2804 ph10 178 if (-c == ESC_H)
2805     {
2806     for (c = 0; c < 32; c++)
2807     {
2808     int x = 0xff;
2809     switch (c)
2810 ph10 180 {
2811 ph10 178 case 0x09/8: x ^= 1 << (0x09%8); break;
2812     case 0x20/8: x ^= 1 << (0x20%8); break;
2813     case 0xa0/8: x ^= 1 << (0xa0%8); break;
2814     default: break;
2815     }
2816     classbits[c] |= x;
2817 ph10 180 }
2818    
2819 ph10 178 #ifdef SUPPORT_UTF8
2820     if (utf8)
2821 ph10 180 {
2822 ph10 178 class_utf8 = TRUE;
2823 ph10 180 *class_utf8data++ = XCL_RANGE;
2824     class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2825     class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
2826     *class_utf8data++ = XCL_RANGE;
2827     class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
2828     class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
2829     *class_utf8data++ = XCL_RANGE;
2830     class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
2831     class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
2832     *class_utf8data++ = XCL_RANGE;
2833     class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
2834     class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
2835     *class_utf8data++ = XCL_RANGE;
2836     class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
2837     class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
2838     *class_utf8data++ = XCL_RANGE;
2839     class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
2840     class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
2841     *class_utf8data++ = XCL_RANGE;
2842     class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
2843     class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2844     }
2845     #endif
2846     continue;
2847     }
2848 ph10 178
2849     if (-c == ESC_v)
2850     {
2851     SETBIT(classbits, 0x0a); /* LF */
2852     SETBIT(classbits, 0x0b); /* VT */
2853 ph10 180 SETBIT(classbits, 0x0c); /* FF */
2854     SETBIT(classbits, 0x0d); /* CR */
2855     SETBIT(classbits, 0x85); /* NEL */
2856 ph10 178 #ifdef SUPPORT_UTF8
2857     if (utf8)
2858 ph10 180 {
2859 ph10 178 class_utf8 = TRUE;
2860 ph10 180 *class_utf8data++ = XCL_RANGE;
2861     class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
2862     class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2863     }
2864     #endif
2865     continue;
2866     }
2867 ph10 178
2868     if (-c == ESC_V)
2869     {
2870     for (c = 0; c < 32; c++)
2871     {
2872     int x = 0xff;
2873     switch (c)
2874 ph10 180 {
2875 ph10 178 case 0x0a/8: x ^= 1 << (0x0a%8);
2876     x ^= 1 << (0x0b%8);
2877     x ^= 1 << (0x0c%8);
2878 ph10 180 x ^= 1 << (0x0d%8);
2879 ph10 178 break;
2880     case 0x85/8: x ^= 1 << (0x85%8); break;
2881     default: break;
2882     }
2883     classbits[c] |= x;
2884 ph10 180 }
2885    
2886 ph10 178 #ifdef SUPPORT_UTF8
2887     if (utf8)
2888 ph10 180 {
2889 ph10 178 class_utf8 = TRUE;
2890 ph10 180 *class_utf8data++ = XCL_RANGE;
2891     class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2892     class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
2893     *class_utf8data++ = XCL_RANGE;
2894     class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2895     class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2896     }
2897     #endif
2898     continue;
2899     }
2900 ph10 178
2901 nigel 93 /* We need to deal with \P and \p in both phases. */
2902    
2903 nigel 77 #ifdef SUPPORT_UCP
2904 nigel 93 if (-c == ESC_p || -c == ESC_P)
2905     {
2906     BOOL negated;
2907     int pdata;
2908     int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2909     if (ptype < 0) goto FAILED;
2910     class_utf8 = TRUE;
2911     *class_utf8data++ = ((-c == ESC_p) != negated)?
2912     XCL_PROP : XCL_NOTPROP;
2913     *class_utf8data++ = ptype;
2914     *class_utf8data++ = pdata;
2915     class_charcount -= 2; /* Not a < 256 character */
2916 nigel 77 continue;
2917 nigel 93 }
2918 nigel 77 #endif
2919 nigel 93 /* Unrecognized escapes are faulted if PCRE is running in its
2920     strict mode. By default, for compatibility with Perl, they are
2921     treated as literals. */
2922 nigel 77
2923 nigel 93 if ((options & PCRE_EXTRA) != 0)
2924     {
2925     *errorcodeptr = ERR7;
2926     goto FAILED;
2927     }
2928 nigel 77
2929 nigel 93 class_charcount -= 2; /* Undo the default count from above */
2930     c = *ptr; /* Get the final character and fall through */
2931 nigel 77 }
2932    
2933     /* Fall through if we have a single character (c >= 0). This may be
2934 nigel 93 greater than 256 in UTF-8 mode. */
2935 nigel 77
2936     } /* End of backslash handling */
2937    
2938     /* A single character may be followed by '-' to form a range. However,
2939     Perl does not permit ']' to be the end of the range. A '-' character
2940 nigel 93 at the end is treated as a literal. Perl ignores orphaned \E sequences
2941     entirely. The code for handling \Q and \E is messy. */
2942 nigel 77
2943 nigel 93 CHECK_RANGE:
2944     while (ptr[1] == '\\' && ptr[2] == 'E')
2945 nigel 77 {
2946 nigel 93 inescq = FALSE;
2947     ptr += 2;
2948     }
2949    
2950     oldptr = ptr;
2951    
2952     if (!inescq && ptr[1] == '-')
2953     {
2954 nigel 77 int d;
2955     ptr += 2;
2956 nigel 93 while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
2957 nigel 77
2958 nigel 93 /* If we hit \Q (not followed by \E) at this point, go into escaped
2959     mode. */
2960    
2961     while (*ptr == '\\' && ptr[1] == 'Q')
2962     {
2963     ptr += 2;
2964     if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
2965     inescq = TRUE;
2966     break;
2967     }
2968    
2969     if (*ptr == 0 || (!inescq && *ptr == ']'))
2970     {
2971     ptr = oldptr;
2972     goto LONE_SINGLE_CHARACTER;
2973     }
2974    
2975 nigel 77 #ifdef SUPPORT_UTF8
2976     if (utf8)
2977     { /* Braces are required because the */
2978     GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
2979     }
2980     else
2981     #endif
2982     d = *ptr; /* Not UTF-8 mode */
2983    
2984     /* The second part of a range can be a single-character escape, but
2985     not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2986     in such circumstances. */
2987    
2988 nigel 93 if (!inescq && d == '\\')
2989 nigel 77 {
2990 nigel 93 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2991     if (*errorcodeptr != 0) goto FAILED;
2992 nigel 77
2993 nigel 93 /* \b is backspace; \X is literal X; \R is literal R; any other
2994     special means the '-' was literal */
2995 nigel 77
2996     if (d < 0)
2997     {
2998     if (d == -ESC_b) d = '\b';
2999 nigel 93 else if (d == -ESC_X) d = 'X';
3000     else if (d == -ESC_R) d = 'R'; else
3001 nigel 77 {
3002 nigel 93 ptr = oldptr;
3003 nigel 77 goto LONE_SINGLE_CHARACTER; /* A few lines below */
3004     }
3005     }
3006     }
3007    
3008 nigel 93 /* Check that the two values are in the correct order. Optimize
3009     one-character ranges */
3010 nigel 77
3011 nigel 93 if (d < c)
3012     {
3013     *errorcodeptr = ERR8;
3014     goto FAILED;
3015     }
3016    
3017 nigel 77 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
3018    
3019     /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3020     matching, we have to use an XCLASS with extra data items. Caseless
3021     matching for characters > 127 is available only if UCP support is
3022     available. */
3023    
3024     #ifdef SUPPORT_UTF8
3025     if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
3026     {
3027     class_utf8 = TRUE;
3028    
3029     /* With UCP support, we can find the other case equivalents of
3030     the relevant characters. There may be several ranges. Optimize how
3031     they fit with the basic range. */
3032    
3033     #ifdef SUPPORT_UCP
3034     if ((options & PCRE_CASELESS) != 0)
3035     {
3036 nigel 93 unsigned int occ, ocd;
3037     unsigned int cc = c;
3038     unsigned int origd = d;
3039 nigel 77 while (get_othercase_range(&cc, origd, &occ, &ocd))
3040     {
3041 ph10 180 if (occ >= (unsigned int)c &&
3042     ocd <= (unsigned int)d)
3043 ph10 176 continue; /* Skip embedded ranges */
3044 nigel 77
3045 ph10 180 if (occ < (unsigned int)c &&
3046 ph10 176 ocd >= (unsigned int)c - 1) /* Extend the basic range */
3047 nigel 77 { /* if there is overlap, */
3048     c = occ; /* noting that if occ < c */
3049     continue; /* we can't have ocd > d */
3050     } /* because a subrange is */
3051 ph10 180 if (ocd > (unsigned int)d &&
3052 ph10 176 occ <= (unsigned int)d + 1) /* always shorter than */
3053 nigel 77 { /* the basic range. */
3054     d = ocd;
3055     continue;
3056     }
3057    
3058     if (occ == ocd)
3059     {
3060     *class_utf8data++ = XCL_SINGLE;
3061     }
3062     else
3063     {
3064     *class_utf8data++ = XCL_RANGE;
3065     class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3066     }
3067     class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3068     }
3069     }
3070     #endif /* SUPPORT_UCP */
3071    
3072     /* Now record the original range, possibly modified for UCP caseless
3073     overlapping ranges. */
3074    
3075     *class_utf8data++ = XCL_RANGE;
3076     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3077     class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3078    
3079     /* With UCP support, we are done. Without UCP support, there is no
3080     caseless matching for UTF-8 characters > 127; we can use the bit map
3081     for the smaller ones. */
3082    
3083     #ifdef SUPPORT_UCP
3084     continue; /* With next character in the class */
3085     #else
3086     if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3087    
3088     /* Adjust upper limit and fall through to set up the map */
3089    
3090     d = 127;
3091    
3092     #endif /* SUPPORT_UCP */
3093     }
3094     #endif /* SUPPORT_UTF8 */
3095    
3096     /* We use the bit map for all cases when not in UTF-8 mode; else
3097     ranges that lie entirely within 0-127 when there is UCP support; else
3098     for partial ranges without UCP support. */
3099    
3100 nigel 93 class_charcount += d - c + 1;
3101     class_lastchar = d;
3102    
3103     /* We can save a bit of time by skipping this in the pre-compile. */
3104    
3105     if (lengthptr == NULL) for (; c <= d; c++)
3106 nigel 77 {
3107     classbits[c/8] |= (1 << (c&7));
3108     if ((options & PCRE_CASELESS) != 0)
3109     {
3110     int uc = cd->fcc[c]; /* flip case */
3111     classbits[uc/8] |= (1 << (uc&7));
3112     }
3113     }
3114    
3115     continue; /* Go get the next char in the class */
3116     }
3117    
3118     /* Handle a lone single character - we can get here for a normal
3119     non-escape char, or after \ that introduces a single character or for an
3120     apparent range that isn't. */
3121    
3122     LONE_SINGLE_CHARACTER:
3123    
3124     /* Handle a character that cannot go in the bit map */
3125    
3126     #ifdef SUPPORT_UTF8
3127     if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3128     {
3129     class_utf8 = TRUE;
3130     *class_utf8data++ = XCL_SINGLE;
3131     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3132    
3133     #ifdef SUPPORT_UCP
3134     if ((options & PCRE_CASELESS) != 0)
3135     {
3136 nigel 93 unsigned int othercase;
3137     if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
3138 nigel 77 {
3139     *class_utf8data++ = XCL_SINGLE;
3140     class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3141     }
3142     }
3143     #endif /* SUPPORT_UCP */
3144    
3145     }
3146     else
3147     #endif /* SUPPORT_UTF8 */
3148    
3149     /* Handle a single-byte character */
3150     {
3151     classbits[c/8] |= (1 << (c&7));
3152     if ((options & PCRE_CASELESS) != 0)
3153     {
3154     c = cd->fcc[c]; /* flip case */
3155     classbits[c/8] |= (1 << (c&7));
3156     }
3157     class_charcount++;
3158     class_lastchar = c;
3159     }
3160     }
3161    
3162 nigel 93 /* Loop until ']' reached. This "while" is the end of the "do" above. */
3163 nigel 77
3164 nigel 93 while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
3165 nigel 77
3166 nigel 93 if (c == 0) /* Missing terminating ']' */
3167     {
3168     *errorcodeptr = ERR6;
3169     goto FAILED;
3170     }
3171 ph10 208
3172 nigel 77 /* If class_charcount is 1, we saw precisely one character whose value is
3173     less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
3174     can optimize the negative case only if there were no characters >= 128
3175     because OP_NOT and the related opcodes like OP_NOTSTAR operate on
3176     single-bytes only. This is an historical hangover. Maybe one day we can
3177     tidy these opcodes to handle multi-byte characters.
3178    
3179     The optimization throws away the bit map. We turn the item into a
3180     1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3181     that OP_NOT does not support multibyte characters. In the positive case, it
3182     can cause firstbyte to be set. Otherwise, there can be no first char if
3183     this item is first, whatever repeat count may follow. In the case of
3184     reqbyte, save the previous value for reinstating. */
3185    
3186     #ifdef SUPPORT_UTF8
3187     if (class_charcount == 1 &&
3188     (!utf8 ||
3189     (!class_utf8 && (!negate_class || class_lastchar < 128))))
3190    
3191     #else
3192     if (class_charcount == 1)
3193     #endif
3194     {
3195     zeroreqbyte = reqbyte;
3196    
3197     /* The OP_NOT opcode works on one-byte characters only. */
3198    
3199     if (negate_class)
3200     {
3201     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3202     zerofirstbyte = firstbyte;
3203     *code++ = OP_NOT;
3204     *code++ = class_lastchar;
3205     break;
3206     }
3207    
3208     /* For a single, positive character, get the value into mcbuffer, and
3209     then we can handle this with the normal one-character code. */
3210    
3211     #ifdef SUPPORT_UTF8
3212     if (utf8 && class_lastchar > 127)
3213     mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3214     else
3215     #endif
3216     {
3217     mcbuffer[0] = class_lastchar;
3218     mclength = 1;
3219     }
3220     goto ONE_CHAR;
3221     } /* End of 1-char optimization */
3222    
3223     /* The general case - not the one-char optimization. If this is the first
3224     thing in the branch, there can be no first char setting, whatever the
3225     repeat count. Any reqbyte setting must remain unchanged after any kind of
3226     repeat. */
3227    
3228     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3229     zerofirstbyte = firstbyte;
3230     zeroreqbyte = reqbyte;
3231    
3232     /* If there are characters with values > 255, we have to compile an
3233     extended class, with its own opcode. If there are no characters < 256,
3234 nigel 93 we can omit the bitmap in the actual compiled code. */
3235 nigel 77
3236     #ifdef SUPPORT_UTF8
3237     if (class_utf8)
3238     {
3239     *class_utf8data++ = XCL_END; /* Marks the end of extra data */
3240     *code++ = OP_XCLASS;
3241     code += LINK_SIZE;
3242     *code = negate_class? XCL_NOT : 0;
3243    
3244 nigel 93 /* If the map is required, move up the extra data to make room for it;
3245     otherwise just move the code pointer to the end of the extra data. */
3246 nigel 77
3247     if (class_charcount > 0)
3248     {
3249     *code++ |= XCL_MAP;
3250 nigel 93 memmove(code + 32, code, class_utf8data - code);
3251 nigel 77 memcpy(code, classbits, 32);
3252 nigel 93 code = class_utf8data + 32;
3253 nigel 77 }
3254 nigel 93 else code = class_utf8data;
3255 nigel 77
3256     /* Now fill in the complete length of the item */
3257    
3258     PUT(previous, 1, code - previous);
3259     break; /* End of class handling */
3260     }
3261     #endif
3262    
3263     /* If there are no characters > 255, negate the 32-byte map if necessary,
3264     and copy it into the code vector. If this is the first thing in the branch,
3265     there can be no first char setting, whatever the repeat count. Any reqbyte
3266     setting must remain unchanged after any kind of repeat. */
3267    
3268     if (negate_class)
3269     {
3270     *code++ = OP_NCLASS;
3271 nigel 93 if (lengthptr == NULL) /* Save time in the pre-compile phase */
3272     for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3273 nigel 77 }
3274     else
3275     {
3276     *code++ = OP_CLASS;
3277     memcpy(code, classbits, 32);
3278     }
3279     code += 32;
3280     break;
3281    
3282 nigel 93
3283     /* ===================================================================*/
3284 nigel 77 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3285     has been tested above. */
3286    
3287     case '{':
3288     if (!is_quantifier) goto NORMAL_CHAR;
3289     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3290     if (*errorcodeptr != 0) goto FAILED;
3291     goto REPEAT;
3292    
3293     case '*':
3294     repeat_min = 0;
3295     repeat_max = -1;
3296     goto REPEAT;
3297    
3298     case '+':
3299     repeat_min = 1;
3300     repeat_max = -1;
3301     goto REPEAT;
3302    
3303     case '?':
3304     repeat_min = 0;
3305     repeat_max = 1;
3306    
3307     REPEAT:
3308     if (previous == NULL)
3309     {
3310     *errorcodeptr = ERR9;
3311     goto FAILED;
3312     }
3313    
3314     if (repeat_min == 0)
3315     {
3316     firstbyte = zerofirstbyte; /* Adjust for zero repeat */
3317     reqbyte = zeroreqbyte; /* Ditto */
3318     }
3319    
3320     /* Remember whether this is a variable length repeat */
3321    
3322     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3323    
3324     op_type = 0; /* Default single-char op codes */
3325     possessive_quantifier = FALSE; /* Default not possessive quantifier */
3326    
3327     /* Save start of previous item, in case we have to move it up to make space
3328     for an inserted OP_ONCE for the additional '+' extension. */
3329    
3330     tempcode = previous;
3331    
3332     /* If the next character is '+', we have a possessive quantifier. This
3333     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3334     If the next character is '?' this is a minimizing repeat, by default,
3335     but if PCRE_UNGREEDY is set, it works the other way round. We change the
3336     repeat type to the non-default. */
3337    
3338     if (ptr[1] == '+')
3339     {
3340     repeat_type = 0; /* Force greedy */
3341     possessive_quantifier = TRUE;
3342     ptr++;
3343     }
3344     else if (ptr[1] == '?')
3345     {
3346     repeat_type = greedy_non_default;
3347     ptr++;
3348     }
3349     else repeat_type = greedy_default;
3350    
3351     /* If previous was a character match, abolish the item and generate a
3352     repeat item instead. If a char item has a minimum of more than one, ensure
3353     that it is set in reqbyte - it might not be if a sequence such as x{3} is
3354     the first thing in a branch because the x will have gone into firstbyte
3355     instead. */
3356    
3357     if (*previous == OP_CHAR || *previous == OP_CHARNC)
3358     {
3359     /* Deal with UTF-8 characters that take up more than one byte. It's
3360     easier to write this out separately than try to macrify it. Use c to
3361     hold the length of the character in bytes, plus 0x80 to flag that it's a
3362     length rather than a small character. */
3363    
3364     #ifdef SUPPORT_UTF8
3365     if (utf8 && (code[-1] & 0x80) != 0)
3366     {
3367     uschar *lastchar = code - 1;
3368     while((*lastchar & 0xc0) == 0x80) lastchar--;
3369     c = code - lastchar; /* Length of UTF-8 character */
3370     memcpy(utf8_char, lastchar, c); /* Save the char */
3371     c |= 0x80; /* Flag c as a length */
3372     }
3373     else
3374     #endif
3375    
3376     /* Handle the case of a single byte - either with no UTF8 support, or
3377     with UTF-8 disabled, or for a UTF-8 character < 128. */
3378    
3379     {
3380     c = code[-1];
3381     if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3382     }
3383    
3384 nigel 93 /* If the repetition is unlimited, it pays to see if the next thing on
3385     the line is something that cannot possibly match this character. If so,
3386     automatically possessifying this item gains some performance in the case
3387     where the match fails. */
3388    
3389     if (!possessive_quantifier &&
3390     repeat_max < 0 &&
3391     check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3392     options, cd))
3393     {
3394     repeat_type = 0; /* Force greedy */
3395     possessive_quantifier = TRUE;
3396     }
3397    
3398 nigel 77 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3399     }
3400    
3401     /* If previous was a single negated character ([^a] or similar), we use
3402     one of the special opcodes, replacing it. The code is shared with single-
3403     character repeats by setting op_type to add a suitable offset into
3404 nigel 93 repeat_type. We can also test for auto-possessification. OP_NOT is
3405     currently used only for single-byte chars. */
3406 nigel 77
3407     else if (*previous == OP_NOT)
3408     {
3409     op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3410     c = previous[1];
3411 nigel 93 if (!possessive_quantifier &&
3412     repeat_max < 0 &&
3413     check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3414     {
3415     repeat_type = 0; /* Force greedy */
3416     possessive_quantifier = TRUE;
3417     }
3418 nigel 77 goto OUTPUT_SINGLE_REPEAT;
3419     }
3420    
3421     /* If previous was a character type match (\d or similar), abolish it and
3422     create a suitable repeat item. The code is shared with single-character
3423     repeats by setting op_type to add a suitable offset into repeat_type. Note
3424     that the Unicode property types will be present only when SUPPORT_UCP is
3425     defined, but we don't wrap the little bits of code here because it just
3426     makes it horribly messy. */
3427    
3428     else if (*previous < OP_EODN)
3429     {
3430     uschar *oldcode;
3431 nigel 87 int prop_type, prop_value;
3432 nigel 77 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3433     c = *previous;
3434    
3435 nigel 93 if (!possessive_quantifier &&
3436     repeat_max < 0 &&
3437     check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3438     {
3439     repeat_type = 0; /* Force greedy */
3440     possessive_quantifier = TRUE;
3441     }
3442    
3443 nigel 77 OUTPUT_SINGLE_REPEAT:
3444 nigel 87 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3445     {
3446     prop_type = previous[1];
3447     prop_value = previous[2];
3448     }
3449     else prop_type = prop_value = -1;
3450 nigel 77
3451     oldcode = code;
3452     code = previous; /* Usually overwrite previous item */
3453    
3454     /* If the maximum is zero then the minimum must also be zero; Perl allows
3455     this case, so we do too - by simply omitting the item altogether. */
3456    
3457     if (repeat_max == 0) goto END_REPEAT;
3458    
3459     /* All real repeats make it impossible to handle partial matching (maybe
3460     one day we will be able to remove this restriction). */
3461    
3462     if (repeat_max != 1) cd->nopartial = TRUE;
3463    
3464     /* Combine the op_type with the repeat_type */
3465    
3466     repeat_type += op_type;
3467    
3468     /* A minimum of zero is handled either as the special case * or ?, or as
3469     an UPTO, with the maximum given. */
3470    
3471     if (repeat_min == 0)
3472     {
3473     if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3474     else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3475     else
3476     {
3477     *code++ = OP_UPTO + repeat_type;
3478     PUT2INC(code, 0, repeat_max);
3479     }
3480     }
3481    
3482     /* A repeat minimum of 1 is optimized into some special cases. If the
3483 nigel 93 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3484 nigel 77 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3485     one less than the maximum. */
3486    
3487     else if (repeat_min == 1)
3488     {
3489     if (repeat_max == -1)
3490     *code++ = OP_PLUS + repeat_type;
3491     else
3492     {
3493     code = oldcode; /* leave previous item in place */
3494     if (repeat_max == 1) goto END_REPEAT;
3495     *code++ = OP_UPTO + repeat_type;
3496     PUT2INC(code, 0, repeat_max - 1);
3497     }
3498     }
3499    
3500     /* The case {n,n} is just an EXACT, while the general case {n,m} is
3501     handled as an EXACT followed by an UPTO. */
3502    
3503     else
3504     {
3505     *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3506     PUT2INC(code, 0, repeat_min);
3507    
3508     /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3509     we have to insert the character for the previous code. For a repeated
3510 nigel 87 Unicode property match, there are two extra bytes that define the
3511 nigel 77 required property. In UTF-8 mode, long characters have their length in
3512     c, with the 0x80 bit as a flag. */
3513    
3514     if (repeat_max < 0)
3515     {
3516     #ifdef SUPPORT_UTF8
3517     if (utf8 && c >= 128)
3518     {
3519     memcpy(code, utf8_char, c & 7);
3520     code += c & 7;
3521     }
3522     else
3523     #endif
3524     {
3525     *code++ = c;
3526 nigel 87 if (prop_type >= 0)
3527     {
3528     *code++ = prop_type;
3529     *code++ = prop_value;
3530     }
3531 nigel 77 }
3532     *code++ = OP_STAR + repeat_type;
3533     }
3534    
3535     /* Else insert an UPTO if the max is greater than the min, again
3536 nigel 93 preceded by the character, for the previously inserted code. If the
3537     UPTO is just for 1 instance, we can use QUERY instead. */
3538 nigel 77
3539     else if (repeat_max != repeat_min)
3540     {
3541     #ifdef SUPPORT_UTF8
3542     if (utf8 && c >= 128)
3543     {
3544     memcpy(code, utf8_char, c & 7);
3545     code += c & 7;
3546     }
3547     else
3548     #endif
3549     *code++ = c;
3550 nigel 87 if (prop_type >= 0)
3551     {
3552     *code++ = prop_type;
3553     *code++ = prop_value;
3554     }
3555 nigel 77 repeat_max -= repeat_min;
3556 nigel 93
3557     if (repeat_max == 1)
3558     {
3559     *code++ = OP_QUERY + repeat_type;
3560     }
3561     else
3562     {
3563     *code++ = OP_UPTO + repeat_type;
3564     PUT2INC(code, 0, repeat_max);
3565     }
3566 nigel 77 }
3567     }
3568    
3569     /* The character or character type itself comes last in all cases. */
3570    
3571     #ifdef SUPPORT_UTF8
3572     if (utf8 && c >= 128)
3573     {
3574     memcpy(code, utf8_char, c & 7);
3575     code += c & 7;
3576     }
3577     else
3578     #endif
3579     *code++ = c;
3580    
3581 nigel 87 /* For a repeated Unicode property match, there are two extra bytes that
3582     define the required property. */
3583 nigel 77
3584     #ifdef SUPPORT_UCP
3585 nigel 87 if (prop_type >= 0)
3586     {
3587     *code++ = prop_type;
3588     *code++ = prop_value;
3589     }
3590 nigel 77 #endif
3591     }
3592    
3593     /* If previous was a character class or a back reference, we put the repeat
3594     stuff after it, but just skip the item if the repeat was {0,0}. */
3595    
3596     else if (*previous == OP_CLASS ||
3597     *previous == OP_NCLASS ||
3598     #ifdef SUPPORT_UTF8
3599     *previous == OP_XCLASS ||
3600     #endif
3601     *previous == OP_REF)
3602     {
3603     if (repeat_max == 0)
3604     {
3605     code = previous;
3606     goto END_REPEAT;
3607     }
3608    
3609     /* All real repeats make it impossible to handle partial matching (maybe
3610     one day we will be able to remove this restriction). */
3611    
3612     if (repeat_max != 1) cd->nopartial = TRUE;
3613    
3614     if (repeat_min == 0 && repeat_max == -1)
3615     *code++ = OP_CRSTAR + repeat_type;
3616     else if (repeat_min == 1 && repeat_max == -1)
3617     *code++ = OP_CRPLUS + repeat_type;
3618     else if (repeat_min == 0 && repeat_max == 1)
3619     *code++ = OP_CRQUERY + repeat_type;
3620     else
3621     {
3622     *code++ = OP_CRRANGE + repeat_type;
3623     PUT2INC(code, 0, repeat_min);
3624     if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
3625     PUT2INC(code, 0, repeat_max);
3626     }
3627     }
3628    
3629     /* If previous was a bracket group, we may have to replicate it in certain
3630     cases. */
3631    
3632 nigel 93 else if (*previous == OP_BRA || *previous == OP_CBRA ||
3633     *previous == OP_ONCE || *previous == OP_COND)
3634 nigel 77 {
3635     register int i;
3636     int ketoffset = 0;
3637     int len = code - previous;
3638     uschar *bralink = NULL;
3639    
3640 nigel 93 /* Repeating a DEFINE group is pointless */
3641    
3642     if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3643     {
3644     *errorcodeptr = ERR55;
3645     goto FAILED;
3646     }
3647    
3648 nigel 77 /* If the maximum repeat count is unlimited, find the end of the bracket
3649     by scanning through from the start, and compute the offset back to it
3650     from the current code pointer. There may be an OP_OPT setting following
3651     the final KET, so we can't find the end just by going back from the code
3652     pointer. */
3653    
3654     if (repeat_max == -1)
3655     {
3656     register uschar *ket = previous;
3657     do ket += GET(ket, 1); while (*ket != OP_KET);
3658     ketoffset = code - ket;
3659     }
3660    
3661     /* The case of a zero minimum is special because of the need to stick
3662     OP_BRAZERO in front of it, and because the group appears once in the
3663     data, whereas in other cases it appears the minimum number of times. For
3664     this reason, it is simplest to treat this case separately, as otherwise
3665     the code gets far too messy. There are several special subcases when the
3666     minimum is zero. */
3667    
3668     if (repeat_min == 0)
3669     {
3670     /* If the maximum is also zero, we just omit the group from the output
3671     altogether. */
3672    
3673     if (repeat_max == 0)
3674     {
3675     code = previous;
3676     goto END_REPEAT;
3677     }
3678    
3679     /* If the maximum is 1 or unlimited, we just have to stick in the
3680     BRAZERO and do no more at this point. However, we do need to adjust
3681     any OP_RECURSE calls inside the group that refer to the group itself or
3682 nigel 93 any internal or forward referenced group, because the offset is from
3683     the start of the whole regex. Temporarily terminate the pattern while
3684     doing this. */
3685 nigel 77
3686     if (repeat_max <= 1)
3687     {
3688     *code = OP_END;
3689 nigel 93 adjust_recurse(previous, 1, utf8, cd, save_hwm);
3690 nigel 77 memmove(previous+1, previous, len);
3691     code++;
3692     *previous++ = OP_BRAZERO + repeat_type;
3693     }
3694    
3695     /* If the maximum is greater than 1 and limited, we have to replicate
3696     in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3697     The first one has to be handled carefully because it's the original
3698     copy, which has to be moved up. The remainder can be handled by code
3699     that is common with the non-zero minimum case below. We have to
3700     adjust the value of repeat_max, since one less copy is required. Once
3701     again, we may have to adjust any OP_RECURSE calls inside the group. */
3702    
3703     else
3704     {
3705     int offset;
3706     *code = OP_END;
3707 nigel 93 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3708 nigel 77 memmove(previous + 2 + LINK_SIZE, previous, len);
3709     code += 2 + LINK_SIZE;
3710     *previous++ = OP_BRAZERO + repeat_type;
3711     *previous++ = OP_BRA;
3712    
3713     /* We chain together the bracket offset fields that have to be
3714     filled in later when the ends of the brackets are reached. */
3715    
3716     offset = (bralink == NULL)? 0 : previous - bralink;
3717     bralink = previous;
3718     PUTINC(previous, 0, offset);
3719     }
3720    
3721     repeat_max--;
3722     }
3723    
3724     /* If the minimum is greater than zero, replicate the group as many
3725     times as necessary, and adjust the maximum to the number of subsequent
3726     copies that we need. If we set a first char from the group, and didn't
3727 nigel 93 set a required char, copy the latter from the former. If there are any
3728     forward reference subroutine calls in the group, there will be entries on
3729     the workspace list; replicate these with an appropriate increment. */
3730 nigel 77
3731     else
3732     {
3733     if (repeat_min > 1)
3734     {
3735 nigel 93 /* In the pre-compile phase, we don't actually do the replication. We
3736 ph10 202 just adjust the length as if we had. Do some paranoid checks for
3737     potential integer overflow. */
3738 nigel 93
3739     if (lengthptr != NULL)
3740 ph10 202 {
3741     int delta = (repeat_min - 1)*length_prevgroup;
3742     if ((double)(repeat_min - 1)*(double)length_prevgroup >
3743     (double)INT_MAX ||
3744     OFLOW_MAX - *lengthptr < delta)
3745     {
3746     *errorcodeptr = ERR20;
3747     goto FAILED;
3748     }
3749     *lengthptr += delta;
3750     }
3751 nigel 93
3752     /* This is compiling for real */
3753    
3754     else
3755 nigel 77 {
3756 nigel 93 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3757     for (i = 1; i < repeat_min; i++)
3758     {
3759     uschar *hc;
3760     uschar *this_hwm = cd->hwm;
3761     memcpy(code, previous, len);
3762     for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3763     {
3764     PUT(cd->hwm, 0, GET(hc, 0) + len);
3765     cd->hwm += LINK_SIZE;
3766     }
3767     save_hwm = this_hwm;
3768     code += len;
3769     }
3770 nigel 77 }
3771     }
3772 nigel 93
3773 nigel 77 if (repeat_max > 0) repeat_max -= repeat_min;
3774     }
3775    
3776     /* This code is common to both the zero and non-zero minimum cases. If
3777     the maximum is limited, it replicates the group in a nested fashion,
3778     remembering the bracket starts on a stack. In the case of a zero minimum,
3779     the first one was set up above. In all cases the repeat_max now specifies
3780 nigel 93 the number of additional copies needed. Again, we must remember to
3781     replicate entries on the forward reference list. */
3782 nigel 77
3783     if (repeat_max >= 0)
3784     {
3785 nigel 93 /* In the pre-compile phase, we don't actually do the replication. We
3786     just adjust the length as if we had. For each repetition we must add 1
3787     to the length for BRAZERO and for all but the last repetition we must
3788 ph10 202 add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
3789     paranoid checks to avoid integer overflow. */
3790 nigel 93
3791     if (lengthptr != NULL && repeat_max > 0)
3792 ph10 202 {
3793     int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3794     2 - 2*LINK_SIZE; /* Last one doesn't nest */
3795     if ((double)repeat_max *
3796     (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
3797     > (double)INT_MAX ||
3798     OFLOW_MAX - *lengthptr < delta)
3799     {
3800     *errorcodeptr = ERR20;
3801     goto FAILED;
3802     }
3803     *lengthptr += delta;
3804     }
3805 nigel 93
3806     /* This is compiling for real */
3807    
3808     else for (i = repeat_max - 1; i >= 0; i--)
3809 nigel 77 {
3810 nigel 93 uschar *hc;
3811     uschar *this_hwm = cd->hwm;
3812    
3813 nigel 77 *code++ = OP_BRAZERO + repeat_type;
3814    
3815     /* All but the final copy start a new nesting, maintaining the
3816     chain of brackets outstanding. */
3817    
3818     if (i != 0)
3819     {
3820     int offset;
3821     *code++ = OP_BRA;
3822     offset = (bralink == NULL)? 0 : code - bralink;
3823     bralink = code;
3824     PUTINC(code, 0, offset);
3825     }
3826    
3827     memcpy(code, previous, len);
3828 nigel 93 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3829     {
3830     PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3831     cd->hwm += LINK_SIZE;
3832     }
3833     save_hwm = this_hwm;
3834 nigel 77 code += len;
3835     }
3836    
3837     /* Now chain through the pending brackets, and fill in their length
3838     fields (which are holding the chain links pro tem). */
3839    
3840     while (bralink != NULL)
3841     {
3842     int oldlinkoffset;
3843     int offset = code - bralink + 1;
3844     uschar *bra = code - offset;
3845     oldlinkoffset = GET(bra, 1);
3846     bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
3847     *code++ = OP_KET;
3848     PUTINC(code, 0, offset);
3849     PUT(bra, 1, offset);
3850     }
3851     }
3852    
3853     /* If the maximum is unlimited, set a repeater in the final copy. We
3854     can't just offset backwards from the current code point, because we
3855     don't know if there's been an options resetting after the ket. The
3856 nigel 93 correct offset was computed above.
3857 nigel 77
3858 nigel 93 Then, when we are doing the actual compile phase, check to see whether
3859     this group is a non-atomic one that could match an empty string. If so,
3860     convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
3861     that runtime checking can be done. [This check is also applied to
3862     atomic groups at runtime, but in a different way.] */
3863    
3864     else
3865     {
3866     uschar *ketcode = code - ketoffset;
3867     uschar *bracode = ketcode - GET(ketcode, 1);
3868     *ketcode = OP_KETRMAX + repeat_type;
3869     if (lengthptr == NULL && *bracode != OP_ONCE)
3870     {
3871     uschar *scode = bracode;
3872     do
3873     {
3874     if (could_be_empty_branch(scode, ketcode, utf8))
3875     {
3876     *bracode += OP_SBRA - OP_BRA;
3877     break;
3878     }
3879     scode += GET(scode, 1);
3880     }
3881     while (*scode == OP_ALT);
3882     }
3883     }
3884 nigel 77 }
3885    
3886     /* Else there's some kind of shambles */
3887    
3888     else
3889     {
3890     *errorcodeptr = ERR11;
3891     goto FAILED;
3892     }
3893    
3894 nigel 93 /* If the character following a repeat is '+', or if certain optimization
3895     tests above succeeded, possessive_quantifier is TRUE. For some of the
3896     simpler opcodes, there is a special alternative opcode for this. For
3897     anything else, we wrap the entire repeated item inside OP_ONCE brackets.
3898     The '+' notation is just syntactic sugar, taken from Sun's Java package,
3899     but the special opcodes can optimize it a bit. The repeated item starts at
3900     tempcode, not at previous, which might be the first part of a string whose
3901     (former) last char we repeated.
3902 nigel 77
3903 nigel 93 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
3904     an 'upto' may follow. We skip over an 'exact' item, and then test the
3905     length of what remains before proceeding. */
3906    
3907 nigel 77 if (possessive_quantifier)
3908     {
3909 nigel 93 int len;
3910     if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
3911     *tempcode == OP_NOTEXACT)
3912     tempcode += _pcre_OP_lengths[*tempcode];
3913     len = code - tempcode;
3914     if (len > 0) switch (*tempcode)
3915     {
3916     case OP_STAR: *tempcode = OP_POSSTAR; break;
3917     case OP_PLUS: *tempcode = OP_POSPLUS; break;
3918     case OP_QUERY: *tempcode = OP_POSQUERY; break;
3919     case OP_UPTO: *tempcode = OP_POSUPTO; break;
3920    
3921     case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
3922     case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
3923     case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
3924     case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
3925    
3926     case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
3927     case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
3928     case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
3929     case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
3930    
3931     default:
3932     memmove(tempcode + 1+LINK_SIZE, tempcode, len);
3933     code += 1 + LINK_SIZE;
3934     len += 1 + LINK_SIZE;
3935     tempcode[0] = OP_ONCE;
3936     *code++ = OP_KET;
3937     PUTINC(code, 0, len);
3938     PUT(tempcode, 1, len);
3939     break;
3940     }
3941 nigel 77 }
3942    
3943     /* In all cases we no longer have a previous item. We also set the
3944     "follows varying string" flag for subsequently encountered reqbytes if
3945     it isn't already set and we have just passed a varying length item. */
3946    
3947     END_REPEAT:
3948     previous = NULL;
3949     cd->req_varyopt |= reqvary;
3950     break;
3951    
3952    
3953 nigel 93 /* ===================================================================*/
3954     /* Start of nested parenthesized sub-expression, or comment or lookahead or
3955     lookbehind or option setting or condition or all the other extended
3956 ph10 210 parenthesis forms. */
3957 nigel 77
3958     case '(':
3959     newoptions = options;
3960     skipbytes = 0;
3961 nigel 93 bravalue = OP_CBRA;
3962     save_hwm = cd->hwm;
3963 ph10 180 reset_bracount = FALSE;
3964 ph10 211
3965 ph10 210 /* First deal with various "verbs" that can be introduced by '*'. */
3966 ph10 211
3967 ph10 210 if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
3968     {
3969 ph10 211 int i, namelen;
3970 ph10 210 const uschar *name = ++ptr;
3971     previous = NULL;
3972     while ((cd->ctypes[*++ptr] & ctype_letter) != 0);
3973     if (*ptr == ':')
3974     {
3975     *errorcodeptr = ERR59; /* Not supported */
3976 ph10 211 goto FAILED;
3977     }
3978 ph10 210 if (*ptr != ')')
3979     {
3980     *errorcodeptr = ERR60;
3981     goto FAILED;
3982     }
3983 ph10 211 namelen = ptr - name;
3984 ph10 210 for (i = 0; i < verbcount; i++)
3985 ph10 211 {
3986 ph10 210 if (namelen == verbs[i].len &&
3987     strncmp((char *)name, verbs[i].name, namelen) == 0)
3988     {
3989     *code = verbs[i].op;
3990     if (*code++ == OP_ACCEPT) cd->had_accept = TRUE;
3991     break;
3992 ph10 211 }
3993     }
3994     if (i < verbcount) continue;
3995 ph10 210 *errorcodeptr = ERR60;
3996 ph10 211 goto FAILED;
3997     }
3998    
3999 ph10 210 /* Deal with the extended parentheses; all are introduced by '?', and the
4000     appearance of any of them means that this is not a capturing group. */
4001 nigel 77
4002 ph10 210 else if (*ptr == '?')
4003 nigel 77 {
4004 nigel 93 int i, set, unset, namelen;
4005 nigel 77 int *optset;
4006 nigel 93 const uschar *name;
4007     uschar *slot;
4008 nigel 77
4009     switch (*(++ptr))
4010     {
4011     case '#': /* Comment; skip to ket */
4012     ptr++;
4013 nigel 93 while (*ptr != 0 && *ptr != ')') ptr++;
4014     if (*ptr == 0)
4015     {
4016     *errorcodeptr = ERR18;
4017     goto FAILED;
4018     }
4019 nigel 77 continue;
4020    
4021 nigel 93
4022     /* ------------------------------------------------------------ */
4023 ph10 175 case '|': /* Reset capture count for each branch */
4024     reset_bracount = TRUE;
4025 ph10 180 /* Fall through */
4026 ph10 175
4027     /* ------------------------------------------------------------ */
4028 nigel 93 case ':': /* Non-capturing bracket */
4029 nigel 77 bravalue = OP_BRA;
4030     ptr++;
4031     break;
4032    
4033 nigel 93
4034     /* ------------------------------------------------------------ */
4035 nigel 77 case '(':
4036     bravalue = OP_COND; /* Conditional group */
4037    
4038 nigel 93 /* A condition can be an assertion, a number (referring to a numbered
4039     group), a name (referring to a named group), or 'R', referring to
4040     recursion. R<digits> and R&name are also permitted for recursion tests.
4041 nigel 77
4042 nigel 93 There are several syntaxes for testing a named group: (?(name)) is used
4043     by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
4044    
4045     There are two unfortunate ambiguities, caused by history. (a) 'R' can
4046     be the recursive thing or the name 'R' (and similarly for 'R' followed
4047     by digits), and (b) a number could be a name that consists of digits.
4048     In both cases, we look for a name first; if not found, we try the other
4049     cases. */
4050    
4051     /* For conditions that are assertions, check the syntax, and then exit
4052     the switch. This will take control down to where bracketed groups,
4053     including assertions, are processed. */
4054    
4055     if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
4056     break;
4057    
4058     /* Most other conditions use OP_CREF (a couple change to OP_RREF
4059     below), and all need to skip 3 bytes at the start of the group. */
4060    
4061     code[1+LINK_SIZE] = OP_CREF;
4062     skipbytes = 3;
4063 ph10 172 refsign = -1;
4064 nigel 93
4065     /* Check for a test for recursion in a named group. */
4066    
4067     if (ptr[1] == 'R' && ptr[2] == '&')
4068 nigel 77 {
4069 nigel 93 terminator = -1;
4070     ptr += 2;
4071     code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
4072     }
4073 nigel 91
4074 nigel 93 /* Check for a test for a named group's having been set, using the Perl
4075     syntax (?(<name>) or (?('name') */
4076 nigel 91
4077 nigel 93 else if (ptr[1] == '<')
4078     {
4079     terminator = '>';
4080     ptr++;
4081     }
4082     else if (ptr[1] == '\'')
4083     {
4084     terminator = '\'';
4085     ptr++;
4086     }
4087 ph10 172 else
4088 ph10 167 {
4089     terminator = 0;
4090 ph10 172 if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
4091     }
4092 nigel 77
4093 nigel 93 /* We now expect to read a name; anything else is an error */
4094 nigel 77
4095 nigel 93 if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
4096     {
4097     ptr += 1; /* To get the right offset */
4098     *errorcodeptr = ERR28;
4099     goto FAILED;
4100     }
4101    
4102     /* Read the name, but also get it as a number if it's all digits */
4103    
4104     recno = 0;
4105     name = ++ptr;
4106     while ((cd->ctypes[*ptr] & ctype_word) != 0)
4107     {
4108     if (recno >= 0)
4109     recno = ((digitab[*ptr] & ctype_digit) != 0)?
4110     recno * 10 + *ptr - '0' : -1;
4111 nigel 91 ptr++;
4112 nigel 93 }
4113     namelen = ptr - name;
4114 nigel 91
4115 nigel 93 if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
4116     {
4117     ptr--; /* Error offset */
4118     *errorcodeptr = ERR26;
4119     goto FAILED;
4120     }
4121 nigel 91
4122 nigel 93 /* Do no further checking in the pre-compile phase. */
4123 nigel 91
4124 nigel 93 if (lengthptr != NULL) break;
4125 nigel 91
4126 nigel 93 /* In the real compile we do the work of looking for the actual
4127 ph10 167 reference. If the string started with "+" or "-" we require the rest to
4128     be digits, in which case recno will be set. */
4129