/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 223 - (hide annotations) (download)
Mon Aug 20 11:07:53 2007 UTC (7 years, 4 months ago) by ph10
File MIME type: text/plain
File size: 189555 byte(s)
Fix loop for classes containing \p or \P and just one ascii character.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 117 Copyright (c) 1997-2007 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains the external function pcre_compile(), along with
42     supporting internal functions that are not used by other modules. */
43    
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 199 #include <config.h>
47 ph10 200 #endif
48 ph10 199
49 nigel 93 #define NLBLOCK cd /* Block containing newline information */
50     #define PSSTART start_pattern /* Field containing processed string start */
51     #define PSEND end_pattern /* Field containing processed string end */
52    
53 nigel 77 #include "pcre_internal.h"
54    
55    
56 nigel 85 /* When DEBUG is defined, we need the pcre_printint() function, which is also
57     used by pcretest. DEBUG is not defined when building a production library. */
58    
59     #ifdef DEBUG
60     #include "pcre_printint.src"
61     #endif
62    
63    
64 ph10 178 /* Macro for setting individual bits in class bitmaps. */
65    
/* Set bit number b in the class bitmap a (an array of 8-bit units). Both
arguments and the whole expansion are parenthesized so that expressions such as
SETBIT(map, base + n) address the intended bit. Note that b is evaluated twice,
so it must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
67    
68 ph10 202 /* Maximum length value to check against when making sure that the integer that
69     holds the compiled pattern length does not overflow. We make it a bit less than
70     INT_MAX to allow for adding in group terminating bytes, so that we don't have
71     to check them every time. */
72 ph10 178
73 ph10 202 #define OFLOW_MAX (INT_MAX - 20)
74    
75    
76 nigel 77 /*************************************************
77     * Code parameters and static tables *
78     *************************************************/
79    
80 nigel 93 /* This value specifies the size of stack workspace that is used during the
81     first pre-compile phase that determines how much memory is required. The regex
82     is partly compiled into this space, but the compiled parts are discarded as
83     soon as they can be, so that hopefully there will never be an overrun. The code
84     does, however, check for an overrun. The largest amount I've seen used is 218,
85     so this number is very generous.
86 nigel 77
87 nigel 93 The same workspace is used during the second, actual compile phase for
88     remembering forward references to groups so that they can be filled in at the
89     end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
90     is 4 there is plenty of room. */
91 nigel 77
92 nigel 93 #define COMPILE_WORK_SIZE (4096)
93 nigel 77
94 nigel 93
95 nigel 77 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
96     are simple data values; negative values are for special things like \d and so
97     on. Zero means further processing is needed (for things like \x), or the escape
98     is invalid. */
99    
100 ph10 97 #ifndef EBCDIC /* This is the "normal" table for ASCII systems */
101 nigel 77 static const short int escapes[] = {
102     0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
103     0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
104     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
105 ph10 178 -ESC_H, 0, 0, -ESC_K, 0, 0, 0, 0, /* H - O */
106     -ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, -ESC_V, -ESC_W, /* P - W */
107 nigel 77 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
108     '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
109 ph10 178 -ESC_h, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */
110     -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, -ESC_v, -ESC_w, /* p - w */
111 nigel 77 0, 0, -ESC_z /* x - z */
112     };
113    
114 ph10 97 #else /* This is the "abnormal" table for EBCDIC systems */
115 nigel 77 static const short int escapes[] = {
116     /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
117     /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
118     /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
119     /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
120     /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
121     /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
122     /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
123     /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
124 ph10 178 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
125 nigel 93 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
126 nigel 77 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
127 ph10 178 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
128 nigel 77 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
129     /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
130     /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
131     /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
132 ph10 178 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
133 ph10 195 /* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P,
134 nigel 93 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
135 ph10 178 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
136 nigel 77 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
137     /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
138     /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
139     };
140     #endif
141    
142    
143 ph10 210 /* Table of special "verbs" like (*PRUNE) */
144    
145     typedef struct verbitem {
146     const char *name;
147     int len;
148     int op;
149 ph10 211 } verbitem;
150 ph10 210
151     static verbitem verbs[] = {
152     { "ACCEPT", 6, OP_ACCEPT },
153     { "COMMIT", 6, OP_COMMIT },
154     { "F", 1, OP_FAIL },
155 ph10 211 { "FAIL", 4, OP_FAIL },
156 ph10 210 { "PRUNE", 5, OP_PRUNE },
157     { "SKIP", 4, OP_SKIP },
158     { "THEN", 4, OP_THEN }
159     };
160    
161     static int verbcount = sizeof(verbs)/sizeof(verbitem);
162    
163    
164 nigel 77 /* Tables of names of POSIX character classes and their lengths. The list is
165 nigel 87 terminated by a zero length entry. The first three must be alpha, lower, upper,
166 nigel 77 as this is assumed for handling case independence. */
167    
/* Names of the POSIX classes, in the same order as the entries of the length
and class-map tables that accompany this one. */

static const char *const posix_names[] = {
  "alpha",
  "lower",
  "upper",
  "alnum",
  "ascii",
  "blank",
  "cntrl",
  "digit",
  "graph",
  "print",
  "punct",
  "space",
  "word",
  "xdigit"
};
172    
173     static const uschar posix_name_lengths[] = {
174     5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
175    
176 nigel 87 /* Table of class bit maps for each POSIX class. Each class is formed from a
177     base map, with an optional addition or removal of another map. Then, for some
178     classes, there is some additional tweaking: for [:blank:] the vertical space
179     characters are removed, and for [:alpha:] and [:alnum:] the underscore
180     character is removed. The triples in the table consist of the base map offset,
181     second map offset or -1 if no second map, and a non-negative value for map
182     addition or a negative value for map subtraction (if there are two maps). The
183     absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
184     remove vertical space characters, 2 => remove underscore. */
185 nigel 77
186     static const int posix_class_maps[] = {
187 nigel 87 cbit_word, cbit_digit, -2, /* alpha */
188     cbit_lower, -1, 0, /* lower */
189     cbit_upper, -1, 0, /* upper */
190     cbit_word, -1, 2, /* alnum - word without underscore */
191     cbit_print, cbit_cntrl, 0, /* ascii */
192     cbit_space, -1, 1, /* blank - a GNU extension */
193     cbit_cntrl, -1, 0, /* cntrl */
194     cbit_digit, -1, 0, /* digit */
195     cbit_graph, -1, 0, /* graph */
196     cbit_print, -1, 0, /* print */
197     cbit_punct, -1, 0, /* punct */
198     cbit_space, -1, 0, /* space */
199     cbit_word, -1, 0, /* word - a Perl extension */
200     cbit_xdigit,-1, 0 /* xdigit */
201 nigel 77 };
202    
203    
/* Two-step stringification, so that a macro argument is expanded before it is
turned into a string literal. */

#define STRING(a)  # a
#define XSTRING(s) STRING(s)

/* The texts of compile-time error messages. These are "char *" because they
are passed to the outside world. Do not ever re-use any error number, because
they are documented. Always add a new error instead. Messages marked DEAD below
are no longer used. The index into the table is the error number; the numeric
comments mark every fifth entry. The array of pointers itself is const, as the
table is never modified. */

static const char * const error_texts[] = {
  "no error",
  "\\ at end of pattern",
  "\\c at end of pattern",
  "unrecognized character follows \\",
  "numbers out of order in {} quantifier",
  /* 5 */
  "number too big in {} quantifier",
  "missing terminating ] for character class",
  "invalid escape sequence in character class",
  "range out of order in character class",
  "nothing to repeat",
  /* 10 */
  "operand of unlimited repeat could match the empty string",  /** DEAD **/
  "internal error: unexpected repeat",
  "unrecognized character after (?",
  "POSIX named classes are supported only within a class",
  "missing )",
  /* 15 */
  "reference to non-existent subpattern",
  "erroffset passed as NULL",
  "unknown option bit(s) set",
  "missing ) after comment",
  "parentheses nested too deeply",  /** DEAD **/
  /* 20 */
  "regular expression is too large",
  "failed to get memory",
  "unmatched parentheses",
  "internal error: code overflow",
  "unrecognized character after (?<",
  /* 25 */
  "lookbehind assertion is not fixed length",
  "malformed number or name after (?(",
  "conditional group contains more than two branches",
  "assertion expected after (?(",
  "(?R or (?[+-]digits must be followed by )",
  /* 30 */
  "unknown POSIX class name",
  "POSIX collating elements are not supported",
  "this version of PCRE is not compiled with PCRE_UTF8 support",
  "spare error",  /** DEAD **/
  "character value in \\x{...} sequence is too large",
  /* 35 */
  "invalid condition (?(0)",
  "\\C not allowed in lookbehind assertion",
  "PCRE does not support \\L, \\l, \\N, \\U, or \\u",
  "number after (?C is > 255",
  "closing ) for (?C expected",
  /* 40 */
  "recursive call could loop indefinitely",
  "unrecognized character after (?P",
  "syntax error in subpattern name (missing terminator)",
  "two named subpatterns have the same name",
  "invalid UTF-8 string",
  /* 45 */
  "support for \\P, \\p, and \\X has not been compiled",
  "malformed \\P or \\p sequence",
  "unknown property name after \\P or \\p",
  "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)",
  "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")",
  /* 50 */
  "repeated subpattern is too long",    /** DEAD **/
  "octal value is greater than \\377 (not in UTF-8 mode)",
  "internal error: overran compiling workspace",
  "internal error: previously-checked referenced subpattern not found",
  "DEFINE group contains more than one branch",
  /* 55 */
  "repeating a DEFINE group is not allowed",
  "inconsistent NEWLINE options",
  "\\g is not followed by a braced name or an optionally braced non-zero number",
  "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number",
  "(*VERB) with an argument is not supported",
  /* 60 */
  "(*VERB) not recognized",
  "number is too big"
};
288    
289    
290     /* Table to identify digits and hex digits. This is used when compiling
291     patterns. Note that the tables in chartables are dependent on the locale, and
292     may mark arbitrary characters as digits - but the PCRE compiling code expects
293     to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
294     a private table here. It costs 256 bytes, but it is a lot faster than doing
295     character value tests (at least in some simple cases I timed), and in some
296     applications one wants PCRE to compile efficiently as well as match
297     efficiently.
298    
299     For convenience, we use the same bit definitions as in chartables:
300    
301     0x04 decimal digit
302     0x08 hexadecimal digit
303    
304     Then we can use ctype_digit and ctype_xdigit in the code. */
305    
306 ph10 97 #ifndef EBCDIC /* This is the "normal" case, for ASCII systems */
/* Row patterns (eight entries each) used to build the ASCII digit table;
0x04 marks a decimal digit and 0x08 a hexadecimal digit. The helper macros are
local to this table and are undefined again below. */

#define DIGITAB_NONE  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
#define DIGITAB_DIG8  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c
#define DIGITAB_DIG2  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00
#define DIGITAB_HEX   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00

static const unsigned char digitab[] =
  {
  DIGITAB_NONE,    /*   0-  7 */
  DIGITAB_NONE,    /*   8- 15 */
  DIGITAB_NONE,    /*  16- 23 */
  DIGITAB_NONE,    /*  24- 31 */
  DIGITAB_NONE,    /* space - ' */
  DIGITAB_NONE,    /*  ( - /  */
  DIGITAB_DIG8,    /*  0 - 7  */
  DIGITAB_DIG2,    /*  8 - ?  */
  DIGITAB_HEX,     /*  @ - G  */
  DIGITAB_NONE,    /*  H - O  */
  DIGITAB_NONE,    /*  P - W  */
  DIGITAB_NONE,    /*  X - _  */
  DIGITAB_HEX,     /*  ` - g  */
  DIGITAB_NONE,    /*  h - o  */
  DIGITAB_NONE,    /*  p - w  */
  DIGITAB_NONE,    /*  x -127 */
  DIGITAB_NONE,    /* 128-135 */
  DIGITAB_NONE,    /* 136-143 */
  DIGITAB_NONE,    /* 144-151 */
  DIGITAB_NONE,    /* 152-159 */
  DIGITAB_NONE,    /* 160-167 */
  DIGITAB_NONE,    /* 168-175 */
  DIGITAB_NONE,    /* 176-183 */
  DIGITAB_NONE,    /* 184-191 */
  DIGITAB_NONE,    /* 192-199 */
  DIGITAB_NONE,    /* 200-207 */
  DIGITAB_NONE,    /* 208-215 */
  DIGITAB_NONE,    /* 216-223 */
  DIGITAB_NONE,    /* 224-231 */
  DIGITAB_NONE,    /* 232-239 */
  DIGITAB_NONE,    /* 240-247 */
  DIGITAB_NONE     /* 248-255 */
  };

#undef DIGITAB_NONE
#undef DIGITAB_DIG8
#undef DIGITAB_DIG2
#undef DIGITAB_HEX
341    
342 ph10 97 #else /* This is the "abnormal" case, for EBCDIC systems */
/* Row patterns (eight entries each) used to build the EBCDIC digit table;
0x04 marks a decimal digit and 0x08 a hexadecimal digit. The helper macros are
local to this table and are undefined again below. The comment on each row
gives the code point of its first entry. */

#define DIGITAB_NONE  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
#define DIGITAB_DIG8  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c
#define DIGITAB_DIG2  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00
#define DIGITAB_HEX   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00

static const unsigned char digitab[] =
  {
  DIGITAB_NONE,    /* 00 */
  DIGITAB_NONE,    /* 08 */
  DIGITAB_NONE,    /* 10 */
  DIGITAB_NONE,    /* 18 */
  DIGITAB_NONE,    /* 20 */
  DIGITAB_NONE,    /* 28 */
  DIGITAB_NONE,    /* 30 */
  DIGITAB_NONE,    /* 38 */
  DIGITAB_NONE,    /* 40 */
  DIGITAB_NONE,    /* 48 */
  DIGITAB_NONE,    /* 50 */
  DIGITAB_NONE,    /* 58 */
  DIGITAB_NONE,    /* 60 */
  DIGITAB_NONE,    /* 68 */
  DIGITAB_NONE,    /* 70 */
  DIGITAB_NONE,    /* 78 */
  DIGITAB_HEX,     /* 80: up to g */
  DIGITAB_NONE,    /* 88: from h */
  DIGITAB_NONE,    /* 90: up to p */
  DIGITAB_NONE,    /* 98: from q */
  DIGITAB_NONE,    /* A0: up to x */
  DIGITAB_NONE,    /* A8: from y */
  DIGITAB_NONE,    /* B0 */
  DIGITAB_NONE,    /* B8 */
  DIGITAB_HEX,     /* C0: { - G */
  DIGITAB_NONE,    /* C8: from H */
  DIGITAB_NONE,    /* D0: } - P */
  DIGITAB_NONE,    /* D8: from Q */
  DIGITAB_NONE,    /* E0: \ - X */
  DIGITAB_NONE,    /* E8: from Y */
  DIGITAB_DIG8,    /* F0: 0 - 7 */
  DIGITAB_DIG2     /* F8: 8 -255 */
  };

#undef DIGITAB_NONE
#undef DIGITAB_DIG8
#undef DIGITAB_DIG2
#undef DIGITAB_HEX
377    
/* Partial duplicate of the character-type table for EBCDIC. The comment on
each row gives the code point of its first entry. */

static const unsigned char ebcdic_chartab[] =
  {
  /* 00 */ 0x80, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00,
  /* 08 */ 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00,
  /* 10 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00,
  /* 18 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  /* 20 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00,
  /* 28 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  /* 30 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  /* 38 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  /* 40 */ 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  /* 48 */ 0x00, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x80,
  /* 50 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  /* 58 */ 0x00, 0x00, 0x00, 0x80, 0x80, 0x80, 0x00, 0x00,
  /* 60 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  /* 68 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x00, 0x80,
  /* 70 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  /* 78 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  /* 80 */ 0x00, 0x1a, 0x1a, 0x1a, 0x1a, 0x1a, 0x1a, 0x12,
  /* 88 */ 0x12, 0x12, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  /* 90 */ 0x00, 0x12, 0x12, 0x12, 0x12, 0x12, 0x12, 0x12,
  /* 98 */ 0x12, 0x12, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  /* A0 */ 0x00, 0x00, 0x12, 0x12, 0x12, 0x12, 0x12, 0x12,
  /* A8 */ 0x12, 0x12, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  /* B0 */ 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  /* B8 */ 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00,
  /* C0 */ 0x80, 0x1a, 0x1a, 0x1a, 0x1a, 0x1a, 0x1a, 0x12,
  /* C8 */ 0x12, 0x12, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  /* D0 */ 0x00, 0x12, 0x12, 0x12, 0x12, 0x12, 0x12, 0x12,
  /* D8 */ 0x12, 0x12, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  /* E0 */ 0x00, 0x00, 0x12, 0x12, 0x12, 0x12, 0x12, 0x12,
  /* E8 */ 0x12, 0x12, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  /* F0 */ 0x1c, 0x1c, 0x1c, 0x1c, 0x1c, 0x1c, 0x1c, 0x1c,
  /* F8 */ 0x1c, 0x1c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
  };
411     #endif
412    
413    
414     /* Definition to allow mutual recursion */
415    
416     static BOOL
417 ph10 180 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
418 ph10 175 int *, int *, branch_chain *, compile_data *, int *);
419 nigel 77
420    
421    
422     /*************************************************
423     * Handle escapes *
424     *************************************************/
425    
426     /* This function is called when a \ has been encountered. It either returns a
427     positive value for a simple escape such as \n, or a negative value which
428 nigel 93 encodes one of the more complicated things such as \d. A backreference to group
429     n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
430     UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
431     ptr is pointing at the \. On exit, it is on the final character of the escape
432     sequence.
433 nigel 77
434     Arguments:
435     ptrptr points to the pattern position pointer
436     errorcodeptr points to the errorcode variable
437     bracount number of previous extracting brackets
438     options the options bits
439     isclass TRUE if inside a character class
440    
441     Returns: zero or positive => a data character
442     negative => a special escape sequence
443 ph10 213 on error, errorcodeptr is set
444 nigel 77 */
445    
446     static int
447     check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
448     int options, BOOL isclass)
449     {
450 nigel 87 BOOL utf8 = (options & PCRE_UTF8) != 0;
451     const uschar *ptr = *ptrptr + 1;
452 nigel 77 int c, i;
453    
454 nigel 87 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
455     ptr--; /* Set pointer back to the last byte */
456    
457 nigel 77 /* If backslash is at the end of the pattern, it's an error. */
458    
459     if (c == 0) *errorcodeptr = ERR1;
460    
461     /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
462     a table. A non-zero result is something that can be returned immediately.
463     Otherwise further processing may be required. */
464    
465 ph10 97 #ifndef EBCDIC /* ASCII coding */
466 nigel 77 else if (c < '0' || c > 'z') {} /* Not alphameric */
467     else if ((i = escapes[c - '0']) != 0) c = i;
468    
469 ph10 97 #else /* EBCDIC coding */
470 nigel 77 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
471     else if ((i = escapes[c - 0x48]) != 0) c = i;
472     #endif
473    
474     /* Escapes that need further processing, or are illegal. */
475    
476     else
477     {
478     const uschar *oldptr;
479 nigel 93 BOOL braced, negated;
480    
481 nigel 77 switch (c)
482     {
483     /* A number of Perl escapes are not handled by PCRE. We give an explicit
484     error. */
485    
486     case 'l':
487     case 'L':
488     case 'N':
489     case 'u':
490     case 'U':
491     *errorcodeptr = ERR37;
492     break;
493    
494 nigel 93 /* \g must be followed by a number, either plain or braced. If positive, it
495     is an absolute backreference. If negative, it is a relative backreference.
496 ph10 172 This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a
497     reference to a named group. This is part of Perl's movement towards a
498     unified syntax for back references. As this is synonymous with \k{name}, we
499 ph10 171 fudge it up by pretending it really was \k. */
500 nigel 93
501     case 'g':
502     if (ptr[1] == '{')
503     {
504 ph10 171 const uschar *p;
505     for (p = ptr+2; *p != 0 && *p != '}'; p++)
506     if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
507 ph10 172 if (*p != 0 && *p != '}')
508 ph10 171 {
509     c = -ESC_k;
510     break;
511 ph10 172 }
512 nigel 93 braced = TRUE;
513     ptr++;
514     }
515     else braced = FALSE;
516    
517     if (ptr[1] == '-')
518     {
519     negated = TRUE;
520     ptr++;
521     }
522     else negated = FALSE;
523    
524     c = 0;
525     while ((digitab[ptr[1]] & ctype_digit) != 0)
526     c = c * 10 + *(++ptr) - '0';
527 ph10 220
528 ph10 213 if (c < 0)
529     {
530     *errorcodeptr = ERR61;
531     break;
532 ph10 220 }
533 nigel 93
534     if (c == 0 || (braced && *(++ptr) != '}'))
535     {
536     *errorcodeptr = ERR57;
537 ph10 213 break;
538 nigel 93 }
539    
540     if (negated)
541     {
542     if (c > bracount)
543     {
544     *errorcodeptr = ERR15;
545 ph10 213 break;
546 nigel 93 }
547     c = bracount - (c - 1);
548     }
549    
550     c = -(ESC_REF + c);
551     break;
552    
553 nigel 77 /* The handling of escape sequences consisting of a string of digits
554     starting with one that is not zero is not straightforward. By experiment,
555     the way Perl works seems to be as follows:
556    
557     Outside a character class, the digits are read as a decimal number. If the
558     number is less than 10, or if there are that many previous extracting
559     left brackets, then it is a back reference. Otherwise, up to three octal
560     digits are read to form an escaped byte. Thus \123 is likely to be octal
561     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
562     value is greater than 377, the least significant 8 bits are taken. Inside a
563     character class, \ followed by a digit is always an octal number. */
564    
565     case '1': case '2': case '3': case '4': case '5':
566     case '6': case '7': case '8': case '9':
567    
568     if (!isclass)
569     {
570     oldptr = ptr;
571     c -= '0';
572     while ((digitab[ptr[1]] & ctype_digit) != 0)
573     c = c * 10 + *(++ptr) - '0';
574 ph10 213 if (c < 0)
575     {
576     *errorcodeptr = ERR61;
577 ph10 220 break;
578     }
579 nigel 77 if (c < 10 || c <= bracount)
580     {
581     c = -(ESC_REF + c);
582     break;
583     }
584     ptr = oldptr; /* Put the pointer back and fall through */
585     }
586    
587     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
588     generates a binary zero byte and treats the digit as a following literal.
589     Thus we have to pull back the pointer by one. */
590    
591     if ((c = *ptr) >= '8')
592     {
593     ptr--;
594     c = 0;
595     break;
596     }
597    
598     /* \0 always starts an octal number, but we may drop through to here with a
599 nigel 91 larger first octal digit. The original code used just to take the least
600     significant 8 bits of octal numbers (I think this is what early Perls used
601     to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
602     than 3 octal digits. */
603 nigel 77
604     case '0':
605     c -= '0';
606     while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
607     c = c * 8 + *(++ptr) - '0';
608 nigel 91 if (!utf8 && c > 255) *errorcodeptr = ERR51;
609 nigel 77 break;
610    
611 nigel 87 /* \x is complicated. \x{ddd} is a character number which can be greater
612     than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
613     treated as a data character. */
614 nigel 77
615     case 'x':
616 nigel 87 if (ptr[1] == '{')
617 nigel 77 {
618     const uschar *pt = ptr + 2;
619 nigel 87 int count = 0;
620    
621 nigel 77 c = 0;
622     while ((digitab[*pt] & ctype_xdigit) != 0)
623     {
624 nigel 87 register int cc = *pt++;
625     if (c == 0 && cc == '0') continue; /* Leading zeroes */
626 nigel 77 count++;
627 nigel 87
628 ph10 97 #ifndef EBCDIC /* ASCII coding */
629 nigel 77 if (cc >= 'a') cc -= 32; /* Convert to upper case */
630 nigel 87 c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
631 ph10 97 #else /* EBCDIC coding */
632 nigel 77 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
633 nigel 87 c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
634 nigel 77 #endif
635     }
636 nigel 87
637 nigel 77 if (*pt == '}')
638     {
639 nigel 87 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
640 nigel 77 ptr = pt;
641     break;
642     }
643 nigel 87
644 nigel 77 /* If the sequence of hex digits does not end with '}', then we don't
645     recognize this construct; fall through to the normal \x handling. */
646     }
647    
648 nigel 87 /* Read just a single-byte hex-defined char */
649 nigel 77
650     c = 0;
651     while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
652     {
653     int cc; /* Some compilers don't like ++ */
654     cc = *(++ptr); /* in initializers */
655 ph10 97 #ifndef EBCDIC /* ASCII coding */
656 nigel 77 if (cc >= 'a') cc -= 32; /* Convert to upper case */
657     c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
658 ph10 97 #else /* EBCDIC coding */
659 nigel 77 if (cc <= 'z') cc += 64; /* Convert to upper case */
660     c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
661     #endif
662     }
663     break;
664    
665 nigel 93 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
666     This coding is ASCII-specific, but then the whole concept of \cx is
667     ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
668 nigel 77
669     case 'c':
670     c = *(++ptr);
671     if (c == 0)
672     {
673     *errorcodeptr = ERR2;
674 ph10 213 break;
675 nigel 77 }
676    
677 ph10 97 #ifndef EBCDIC /* ASCII coding */
678 nigel 77 if (c >= 'a' && c <= 'z') c -= 32;
679     c ^= 0x40;
680 ph10 97 #else /* EBCDIC coding */
681 nigel 77 if (c >= 'a' && c <= 'z') c += 64;
682     c ^= 0xC0;
683     #endif
684     break;
685    
686     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
687     other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
688     for Perl compatibility, it is a literal. This code looks a bit odd, but
689     there used to be some cases other than the default, and there may be again
690     in future, so I haven't "optimized" it. */
691    
692     default:
693     if ((options & PCRE_EXTRA) != 0) switch(c)
694     {
695     default:
696     *errorcodeptr = ERR3;
697     break;
698     }
699     break;
700     }
701     }
702    
703     *ptrptr = ptr;
704     return c;
705     }
706    
707    
708    
709     #ifdef SUPPORT_UCP
710     /*************************************************
711     * Handle \P and \p *
712     *************************************************/
713    
714     /* This function is called after \P or \p has been encountered, provided that
715     PCRE is compiled with support for Unicode properties. On entry, ptrptr is
716     pointing at the P or p. On exit, it is pointing at the final character of the
717     escape sequence.
718    
719     Argument:
720     ptrptr points to the pattern position pointer
721     negptr points to a boolean that is set TRUE for negation else FALSE
722 nigel 87 dptr points to an int that is set to the detailed property value
723 nigel 77 errorcodeptr points to the error code variable
724    
725 nigel 87 Returns: type value from the _pcre_utt table, or -1 for an invalid type
726 nigel 77 */
727    
728     static int
729 nigel 87 get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
730 nigel 77 {
731     int c, i, bot, top;
732     const uschar *ptr = *ptrptr;
733 nigel 87 char name[32];
734 nigel 77
735     c = *(++ptr);
736     if (c == 0) goto ERROR_RETURN;
737    
738     *negptr = FALSE;
739    
740 nigel 87 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
741     negation. */
742 nigel 77
743     if (c == '{')
744     {
745     if (ptr[1] == '^')
746     {
747     *negptr = TRUE;
748     ptr++;
749     }
750 ph10 199 for (i = 0; i < (int)sizeof(name) - 1; i++)
751 nigel 77 {
752     c = *(++ptr);
753     if (c == 0) goto ERROR_RETURN;
754     if (c == '}') break;
755     name[i] = c;
756     }
757 nigel 87 if (c !='}') goto ERROR_RETURN;
758 nigel 77 name[i] = 0;
759     }
760    
761     /* Otherwise there is just one following character */
762    
763     else
764     {
765     name[0] = c;
766     name[1] = 0;
767     }
768    
769     *ptrptr = ptr;
770    
771     /* Search for a recognized property name using binary chop */
772    
773     bot = 0;
774     top = _pcre_utt_size;
775    
776     while (bot < top)
777     {
778 nigel 87 i = (bot + top) >> 1;
779 nigel 77 c = strcmp(name, _pcre_utt[i].name);
780 nigel 87 if (c == 0)
781     {
782     *dptr = _pcre_utt[i].value;
783     return _pcre_utt[i].type;
784     }
785 nigel 77 if (c > 0) bot = i + 1; else top = i;
786     }
787    
788     *errorcodeptr = ERR47;
789     *ptrptr = ptr;
790     return -1;
791    
792     ERROR_RETURN:
793     *errorcodeptr = ERR46;
794     *ptrptr = ptr;
795     return -1;
796     }
797     #endif
798    
799    
800    
801    
802     /*************************************************
803     * Check for counted repeat *
804     *************************************************/
805    
806     /* This function is called when a '{' is encountered in a place where it might
807     start a quantifier. It looks ahead to see if it really is a quantifier or not.
808     It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
809     where the ddds are digits.
810    
811     Arguments:
812     p pointer to the first char after '{'
813    
814     Returns: TRUE or FALSE
815     */
816    
817     static BOOL
818     is_counted_repeat(const uschar *p)
819     {
820     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
821     while ((digitab[*p] & ctype_digit) != 0) p++;
822     if (*p == '}') return TRUE;
823    
824     if (*p++ != ',') return FALSE;
825     if (*p == '}') return TRUE;
826    
827     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
828     while ((digitab[*p] & ctype_digit) != 0) p++;
829    
830     return (*p == '}');
831     }
832    
833    
834    
835     /*************************************************
836     * Read repeat counts *
837     *************************************************/
838    
839     /* Read an item of the form {n,m} and return the values. This is called only
840     after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
841     so the syntax is guaranteed to be correct, but we need to check the values.
842    
843     Arguments:
844     p pointer to first char after '{'
845     minp pointer to int for min
846     maxp pointer to int for max
847     returned as -1 if no max
848     errorcodeptr points to error code variable
849    
850     Returns: pointer to '}' on success;
851     current ptr on error, with errorcodeptr set non-zero
852     */
853    
854     static const uschar *
855     read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
856     {
857     int min = 0;
858     int max = -1;
859    
860 nigel 81 /* Read the minimum value and do a paranoid check: a negative value indicates
861     an integer overflow. */
862    
863 nigel 77 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
864 nigel 81 if (min < 0 || min > 65535)
865     {
866     *errorcodeptr = ERR5;
867     return p;
868     }
869 nigel 77
870 nigel 81 /* Read the maximum value if there is one, and again do a paranoid on its size.
871     Also, max must not be less than min. */
872    
873 nigel 77 if (*p == '}') max = min; else
874     {
875     if (*(++p) != '}')
876     {
877     max = 0;
878     while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
879 nigel 81 if (max < 0 || max > 65535)
880     {
881     *errorcodeptr = ERR5;
882     return p;
883     }
884 nigel 77 if (max < min)
885     {
886     *errorcodeptr = ERR4;
887     return p;
888     }
889     }
890     }
891    
892 nigel 81 /* Fill in the required variables, and pass back the pointer to the terminating
893     '}'. */
894 nigel 77
895 nigel 81 *minp = min;
896     *maxp = max;
897 nigel 77 return p;
898     }
899    
900    
901    
902     /*************************************************
903 nigel 93 * Find forward referenced subpattern *
904 nigel 91 *************************************************/
905    
906 nigel 93 /* This function scans along a pattern's text looking for capturing
907     subpatterns, and counting them. If it finds a named pattern that matches the
908     name it is given, it returns its number. Alternatively, if the name is NULL, it
909     returns when it reaches a given numbered subpattern. This is used for forward
910     references to subpatterns. We know that if (?P< is encountered, the name will
911     be terminated by '>' because that is checked in the first pass.
912 nigel 91
913     Arguments:
914 nigel 93 ptr current position in the pattern
915     count current count of capturing parens so far encountered
916     name name to seek, or NULL if seeking a numbered subpattern
917     lorn name length, or subpattern number if name is NULL
918     xmode TRUE if we are in /x mode
919 nigel 91
920     Returns: the number of the named subpattern, or -1 if not found
921     */
922    
923     static int
924 nigel 93 find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
925     BOOL xmode)
926 nigel 91 {
927     const uschar *thisname;
928 nigel 93
929 nigel 91 for (; *ptr != 0; ptr++)
930     {
931 nigel 93 int term;
932    
933     /* Skip over backslashed characters and also entire \Q...\E */
934    
935     if (*ptr == '\\')
936     {
937     if (*(++ptr) == 0) return -1;
938     if (*ptr == 'Q') for (;;)
939     {
940     while (*(++ptr) != 0 && *ptr != '\\');
941     if (*ptr == 0) return -1;
942     if (*(++ptr) == 'E') break;
943     }
944     continue;
945     }
946    
947     /* Skip over character classes */
948    
949     if (*ptr == '[')
950     {
951     while (*(++ptr) != ']')
952     {
953 ph10 220 if (*ptr == 0) return -1;
954 nigel 93 if (*ptr == '\\')
955     {
956     if (*(++ptr) == 0) return -1;
957     if (*ptr == 'Q') for (;;)
958     {
959     while (*(++ptr) != 0 && *ptr != '\\');
960     if (*ptr == 0) return -1;
961     if (*(++ptr) == 'E') break;
962     }
963     continue;
964     }
965     }
966     continue;
967     }
968    
969     /* Skip comments in /x mode */
970    
971     if (xmode && *ptr == '#')
972     {
973     while (*(++ptr) != 0 && *ptr != '\n');
974     if (*ptr == 0) return -1;
975     continue;
976     }
977    
978     /* An opening parens must now be a real metacharacter */
979    
980 nigel 91 if (*ptr != '(') continue;
981 ph10 210 if (ptr[1] != '?' && ptr[1] != '*')
982 nigel 93 {
983     count++;
984     if (name == NULL && count == lorn) return count;
985     continue;
986     }
987    
988     ptr += 2;
989     if (*ptr == 'P') ptr++; /* Allow optional P */
990    
991     /* We have to disambiguate (?<! and (?<= from (?<name> */
992    
993     if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
994     *ptr != '\'')
995     continue;
996    
997 nigel 91 count++;
998 nigel 93
999     if (name == NULL && count == lorn) return count;
1000     term = *ptr++;
1001     if (term == '<') term = '>';
1002 nigel 91 thisname = ptr;
1003 nigel 93 while (*ptr != term) ptr++;
1004     if (name != NULL && lorn == ptr - thisname &&
1005     strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1006 nigel 91 return count;
1007     }
1008 nigel 93
1009 nigel 91 return -1;
1010     }
1011    
1012    
1013    
1014     /*************************************************
1015 nigel 77 * Find first significant op code *
1016     *************************************************/
1017    
1018     /* This is called by several functions that scan a compiled expression looking
1019     for a fixed first character, or an anchoring op code etc. It skips over things
1020     that do not influence this. For some calls, a change of option is important.
1021     For some calls, it makes sense to skip negative forward and all backward
1022     assertions, and also the \b assertion; for others it does not.
1023    
1024     Arguments:
1025     code pointer to the start of the group
1026     options pointer to external options
1027     optbit the option bit whose changing is significant, or
1028     zero if none are
1029     skipassert TRUE if certain assertions are to be skipped
1030    
1031     Returns: pointer to the first significant opcode
1032     */
1033    
1034     static const uschar*
1035     first_significant_code(const uschar *code, int *options, int optbit,
1036     BOOL skipassert)
1037     {
1038     for (;;)
1039     {
1040     switch ((int)*code)
1041     {
1042     case OP_OPT:
1043     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1044     *options = (int)code[1];
1045     code += 2;
1046     break;
1047    
1048     case OP_ASSERT_NOT:
1049     case OP_ASSERTBACK:
1050     case OP_ASSERTBACK_NOT:
1051     if (!skipassert) return code;
1052     do code += GET(code, 1); while (*code == OP_ALT);
1053     code += _pcre_OP_lengths[*code];
1054     break;
1055    
1056     case OP_WORD_BOUNDARY:
1057     case OP_NOT_WORD_BOUNDARY:
1058     if (!skipassert) return code;
1059     /* Fall through */
1060    
1061     case OP_CALLOUT:
1062     case OP_CREF:
1063 nigel 93 case OP_RREF:
1064     case OP_DEF:
1065 nigel 77 code += _pcre_OP_lengths[*code];
1066     break;
1067    
1068     default:
1069     return code;
1070     }
1071     }
1072     /* Control never reaches here */
1073     }
1074    
1075    
1076    
1077    
1078     /*************************************************
1079     * Find the fixed length of a pattern *
1080     *************************************************/
1081    
1082     /* Scan a pattern and compute the fixed length of subject that will match it,
1083     if the length is fixed. This is needed for dealing with backward assertions.
1084     In UTF8 mode, the result is in characters rather than bytes.
1085    
1086     Arguments:
1087     code points to the start of the pattern (the bracket)
1088     options the compiling options
1089    
1090     Returns: the fixed length, or -1 if there is no fixed length,
1091     or -2 if \C was encountered
1092     */
1093    
1094     static int
1095     find_fixedlength(uschar *code, int options)
1096     {
1097     int length = -1;
1098    
1099     register int branchlength = 0;
1100     register uschar *cc = code + 1 + LINK_SIZE;
1101    
1102     /* Scan along the opcodes for this branch. If we get to the end of the
1103     branch, check the length against that of the other branches. */
1104    
1105     for (;;)
1106     {
1107     int d;
1108     register int op = *cc;
1109     switch (op)
1110     {
1111 nigel 93 case OP_CBRA:
1112 nigel 77 case OP_BRA:
1113     case OP_ONCE:
1114     case OP_COND:
1115 nigel 93 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1116 nigel 77 if (d < 0) return d;
1117     branchlength += d;
1118     do cc += GET(cc, 1); while (*cc == OP_ALT);
1119     cc += 1 + LINK_SIZE;
1120     break;
1121    
1122     /* Reached end of a branch; if it's a ket it is the end of a nested
1123     call. If it's ALT it is an alternation in a nested call. If it is
1124     END it's the end of the outer call. All can be handled by the same code. */
1125    
1126     case OP_ALT:
1127     case OP_KET:
1128     case OP_KETRMAX:
1129     case OP_KETRMIN:
1130     case OP_END:
1131     if (length < 0) length = branchlength;
1132     else if (length != branchlength) return -1;
1133     if (*cc != OP_ALT) return length;
1134     cc += 1 + LINK_SIZE;
1135     branchlength = 0;
1136     break;
1137    
1138     /* Skip over assertive subpatterns */
1139    
1140     case OP_ASSERT:
1141     case OP_ASSERT_NOT:
1142     case OP_ASSERTBACK:
1143     case OP_ASSERTBACK_NOT:
1144     do cc += GET(cc, 1); while (*cc == OP_ALT);
1145     /* Fall through */
1146    
1147     /* Skip over things that don't match chars */
1148    
1149     case OP_REVERSE:
1150     case OP_CREF:
1151 nigel 93 case OP_RREF:
1152     case OP_DEF:
1153 nigel 77 case OP_OPT:
1154     case OP_CALLOUT:
1155     case OP_SOD:
1156     case OP_SOM:
1157     case OP_EOD:
1158     case OP_EODN:
1159     case OP_CIRC:
1160     case OP_DOLL:
1161     case OP_NOT_WORD_BOUNDARY:
1162     case OP_WORD_BOUNDARY:
1163     cc += _pcre_OP_lengths[*cc];
1164     break;
1165    
1166     /* Handle literal characters */
1167    
1168     case OP_CHAR:
1169     case OP_CHARNC:
1170 nigel 91 case OP_NOT:
1171 nigel 77 branchlength++;
1172     cc += 2;
1173     #ifdef SUPPORT_UTF8
1174     if ((options & PCRE_UTF8) != 0)
1175     {
1176     while ((*cc & 0xc0) == 0x80) cc++;
1177     }
1178     #endif
1179     break;
1180    
1181     /* Handle exact repetitions. The count is already in characters, but we
1182     need to skip over a multibyte character in UTF8 mode. */
1183    
1184     case OP_EXACT:
1185     branchlength += GET2(cc,1);
1186     cc += 4;
1187     #ifdef SUPPORT_UTF8
1188     if ((options & PCRE_UTF8) != 0)
1189     {
1190     while((*cc & 0x80) == 0x80) cc++;
1191     }
1192     #endif
1193     break;
1194    
1195     case OP_TYPEEXACT:
1196     branchlength += GET2(cc,1);
1197 ph10 220 if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1198 nigel 77 cc += 4;
1199     break;
1200    
1201     /* Handle single-char matchers */
1202    
1203     case OP_PROP:
1204     case OP_NOTPROP:
1205 nigel 87 cc += 2;
1206 nigel 77 /* Fall through */
1207    
1208     case OP_NOT_DIGIT:
1209     case OP_DIGIT:
1210     case OP_NOT_WHITESPACE:
1211     case OP_WHITESPACE:
1212     case OP_NOT_WORDCHAR:
1213     case OP_WORDCHAR:
1214     case OP_ANY:
1215     branchlength++;
1216     cc++;
1217     break;
1218    
1219     /* The single-byte matcher isn't allowed */
1220    
1221     case OP_ANYBYTE:
1222     return -2;
1223    
1224     /* Check a class for variable quantification */
1225    
1226     #ifdef SUPPORT_UTF8
1227     case OP_XCLASS:
1228     cc += GET(cc, 1) - 33;
1229     /* Fall through */
1230     #endif
1231    
1232     case OP_CLASS:
1233     case OP_NCLASS:
1234     cc += 33;
1235    
1236     switch (*cc)
1237     {
1238     case OP_CRSTAR:
1239     case OP_CRMINSTAR:
1240     case OP_CRQUERY:
1241     case OP_CRMINQUERY:
1242     return -1;
1243    
1244     case OP_CRRANGE:
1245     case OP_CRMINRANGE:
1246     if (GET2(cc,1) != GET2(cc,3)) return -1;
1247     branchlength += GET2(cc,1);
1248     cc += 5;
1249     break;
1250    
1251     default:
1252     branchlength++;
1253     }
1254     break;
1255    
1256     /* Anything else is variable length */
1257    
1258     default:
1259     return -1;
1260     }
1261     }
1262     /* Control never gets here */
1263     }
1264    
1265    
1266    
1267    
1268     /*************************************************
1269     * Scan compiled regex for numbered bracket *
1270     *************************************************/
1271    
1272     /* This little function scans through a compiled pattern until it finds a
1273     capturing bracket with the given number.
1274    
1275     Arguments:
1276     code points to start of expression
1277     utf8 TRUE in UTF-8 mode
1278     number the required bracket number
1279    
1280     Returns: pointer to the opcode for the bracket, or NULL if not found
1281     */
1282    
1283     static const uschar *
1284     find_bracket(const uschar *code, BOOL utf8, int number)
1285     {
1286     for (;;)
1287     {
1288     register int c = *code;
1289     if (c == OP_END) return NULL;
1290 nigel 91
1291     /* XCLASS is used for classes that cannot be represented just by a bit
1292     map. This includes negated single high-valued characters. The length in
1293     the table is zero; the actual length is stored in the compiled code. */
1294    
1295     if (c == OP_XCLASS) code += GET(code, 1);
1296    
1297 nigel 93 /* Handle capturing bracket */
1298 nigel 91
1299 nigel 93 else if (c == OP_CBRA)
1300 nigel 77 {
1301 nigel 93 int n = GET2(code, 1+LINK_SIZE);
1302 nigel 77 if (n == number) return (uschar *)code;
1303 nigel 93 code += _pcre_OP_lengths[c];
1304 nigel 77 }
1305 nigel 91
1306 ph10 220 /* Otherwise, we can get the item's length from the table, except that for
1307     repeated character types, we have to test for \p and \P, which have an extra
1308 ph10 218 two bytes of parameters. */
1309 nigel 91
1310 nigel 77 else
1311     {
1312 ph10 218 switch(c)
1313     {
1314     case OP_TYPESTAR:
1315     case OP_TYPEMINSTAR:
1316     case OP_TYPEPLUS:
1317     case OP_TYPEMINPLUS:
1318     case OP_TYPEQUERY:
1319     case OP_TYPEMINQUERY:
1320     case OP_TYPEPOSSTAR:
1321     case OP_TYPEPOSPLUS:
1322     case OP_TYPEPOSQUERY:
1323     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1324 ph10 220 break;
1325 ph10 221
1326     case OP_TYPEUPTO:
1327     case OP_TYPEMINUPTO:
1328     case OP_TYPEEXACT:
1329     case OP_TYPEPOSUPTO:
1330     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1331     break;
1332 ph10 220 }
1333    
1334 ph10 218 /* Add in the fixed length from the table */
1335 ph10 220
1336 nigel 77 code += _pcre_OP_lengths[c];
1337 ph10 220
1338 ph10 218 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1339     a multi-byte character. The length in the table is a minimum, so we have to
1340     arrange to skip the extra bytes. */
1341 ph10 220
1342 ph10 107 #ifdef SUPPORT_UTF8
1343 nigel 77 if (utf8) switch(c)
1344     {
1345     case OP_CHAR:
1346     case OP_CHARNC:
1347     case OP_EXACT:
1348     case OP_UPTO:
1349     case OP_MINUPTO:
1350 nigel 93 case OP_POSUPTO:
1351 nigel 77 case OP_STAR:
1352     case OP_MINSTAR:
1353 nigel 93 case OP_POSSTAR:
1354 nigel 77 case OP_PLUS:
1355     case OP_MINPLUS:
1356 nigel 93 case OP_POSPLUS:
1357 nigel 77 case OP_QUERY:
1358     case OP_MINQUERY:
1359 nigel 93 case OP_POSQUERY:
1360     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1361 nigel 77 break;
1362     }
1363 ph10 111 #endif
1364 nigel 77 }
1365     }
1366     }
1367    
1368    
1369    
1370     /*************************************************
1371     * Scan compiled regex for recursion reference *
1372     *************************************************/
1373    
1374     /* This little function scans through a compiled pattern until it finds an
1375     instance of OP_RECURSE.
1376    
1377     Arguments:
1378     code points to start of expression
1379     utf8 TRUE in UTF-8 mode
1380    
1381     Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1382     */
1383    
1384     static const uschar *
1385     find_recurse(const uschar *code, BOOL utf8)
1386     {
1387     for (;;)
1388     {
1389     register int c = *code;
1390     if (c == OP_END) return NULL;
1391 nigel 91 if (c == OP_RECURSE) return code;
1392 ph10 220
1393 nigel 91 /* XCLASS is used for classes that cannot be represented just by a bit
1394     map. This includes negated single high-valued characters. The length in
1395     the table is zero; the actual length is stored in the compiled code. */
1396    
1397     if (c == OP_XCLASS) code += GET(code, 1);
1398    
1399 ph10 220 /* Otherwise, we can get the item's length from the table, except that for
1400     repeated character types, we have to test for \p and \P, which have an extra
1401 ph10 218 two bytes of parameters. */
1402 nigel 91
1403 nigel 77 else
1404     {
1405 ph10 218 switch(c)
1406     {
1407     case OP_TYPESTAR:
1408     case OP_TYPEMINSTAR:
1409     case OP_TYPEPLUS:
1410     case OP_TYPEMINPLUS:
1411     case OP_TYPEQUERY:
1412     case OP_TYPEMINQUERY:
1413     case OP_TYPEPOSSTAR:
1414     case OP_TYPEPOSPLUS:
1415     case OP_TYPEPOSQUERY:
1416     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1417 ph10 220 break;
1418 ph10 221
1419     case OP_TYPEPOSUPTO:
1420     case OP_TYPEUPTO:
1421     case OP_TYPEMINUPTO:
1422     case OP_TYPEEXACT:
1423     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1424     break;
1425 ph10 220 }
1426    
1427 ph10 218 /* Add in the fixed length from the table */
1428    
1429 nigel 77 code += _pcre_OP_lengths[c];
1430 ph10 220
1431 ph10 218 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1432     by a multi-byte character. The length in the table is a minimum, so we have
1433     to arrange to skip the extra bytes. */
1434 ph10 220
1435 ph10 107 #ifdef SUPPORT_UTF8
1436 nigel 77 if (utf8) switch(c)
1437     {
1438     case OP_CHAR:
1439     case OP_CHARNC:
1440     case OP_EXACT:
1441     case OP_UPTO:
1442     case OP_MINUPTO:
1443 nigel 93 case OP_POSUPTO:
1444 nigel 77 case OP_STAR:
1445     case OP_MINSTAR:
1446 nigel 93 case OP_POSSTAR:
1447 nigel 77 case OP_PLUS:
1448     case OP_MINPLUS:
1449 nigel 93 case OP_POSPLUS:
1450 nigel 77 case OP_QUERY:
1451     case OP_MINQUERY:
1452 nigel 93 case OP_POSQUERY:
1453     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1454 nigel 77 break;
1455     }
1456 ph10 111 #endif
1457 nigel 77 }
1458     }
1459     }
1460    
1461    
1462    
1463     /*************************************************
1464     * Scan compiled branch for non-emptiness *
1465     *************************************************/
1466    
1467     /* This function scans through a branch of a compiled pattern to see whether it
1468 nigel 93 can match the empty string or not. It is called from could_be_empty()
1469     below and from compile_branch() when checking for an unlimited repeat of a
1470     group that can match nothing. Note that first_significant_code() skips over
1471     assertions. If we hit an unclosed bracket, we return "empty" - this means we've
1472     struck an inner bracket whose current branch will already have been scanned.
1473 nigel 77
1474     Arguments:
1475     code points to start of search
1476     endcode points to where to stop
1477     utf8 TRUE if in UTF8 mode
1478    
1479     Returns: TRUE if what is matched could be empty
1480     */
1481    
1482     static BOOL
1483     could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1484     {
1485     register int c;
1486 nigel 93 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1487 nigel 77 code < endcode;
1488     code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1489     {
1490     const uschar *ccode;
1491    
1492     c = *code;
1493 ph10 172
1494 ph10 170 /* Groups with zero repeats can of course be empty; skip them. */
1495 nigel 77
1496 ph10 170 if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1497     {
1498 ph10 172 code += _pcre_OP_lengths[c];
1499 ph10 170 do code += GET(code, 1); while (*code == OP_ALT);
1500     c = *code;
1501     continue;
1502     }
1503    
1504     /* For other groups, scan the branches. */
1505 ph10 172
1506 ph10 206 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1507 nigel 77 {
1508     BOOL empty_branch;
1509     if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1510    
1511     /* Scan a closed bracket */
1512    
1513     empty_branch = FALSE;
1514     do
1515     {
1516     if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1517     empty_branch = TRUE;
1518     code += GET(code, 1);
1519     }
1520     while (*code == OP_ALT);
1521     if (!empty_branch) return FALSE; /* All branches are non-empty */
1522 ph10 172 c = *code;
1523 nigel 93 continue;
1524 nigel 77 }
1525    
1526 nigel 93 /* Handle the other opcodes */
1527    
1528     switch (c)
1529 nigel 77 {
1530 ph10 216 /* Check for quantifiers after a class. XCLASS is used for classes that
1531     cannot be represented just by a bit map. This includes negated single
1532     high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1533 ph10 220 actual length is stored in the compiled code, so we must update "code"
1534 ph10 216 here. */
1535 nigel 77
1536     #ifdef SUPPORT_UTF8
1537     case OP_XCLASS:
1538 ph10 216 ccode = code += GET(code, 1);
1539 nigel 77 goto CHECK_CLASS_REPEAT;
1540     #endif
1541    
1542     case OP_CLASS:
1543     case OP_NCLASS:
1544     ccode = code + 33;
1545    
1546     #ifdef SUPPORT_UTF8
1547     CHECK_CLASS_REPEAT:
1548     #endif
1549    
1550     switch (*ccode)
1551     {
1552     case OP_CRSTAR: /* These could be empty; continue */
1553     case OP_CRMINSTAR:
1554     case OP_CRQUERY:
1555     case OP_CRMINQUERY:
1556     break;
1557    
1558     default: /* Non-repeat => class must match */
1559     case OP_CRPLUS: /* These repeats aren't empty */
1560     case OP_CRMINPLUS:
1561     return FALSE;
1562    
1563     case OP_CRRANGE:
1564     case OP_CRMINRANGE:
1565     if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1566     break;
1567     }
1568     break;
1569    
1570     /* Opcodes that must match a character */
1571    
1572     case OP_PROP:
1573     case OP_NOTPROP:
1574     case OP_EXTUNI:
1575     case OP_NOT_DIGIT:
1576     case OP_DIGIT:
1577     case OP_NOT_WHITESPACE:
1578     case OP_WHITESPACE:
1579     case OP_NOT_WORDCHAR:
1580     case OP_WORDCHAR:
1581     case OP_ANY:
1582     case OP_ANYBYTE:
1583     case OP_CHAR:
1584     case OP_CHARNC:
1585     case OP_NOT:
1586     case OP_PLUS:
1587     case OP_MINPLUS:
1588 nigel 93 case OP_POSPLUS:
1589 nigel 77 case OP_EXACT:
1590     case OP_NOTPLUS:
1591     case OP_NOTMINPLUS:
1592 nigel 93 case OP_NOTPOSPLUS:
1593 nigel 77 case OP_NOTEXACT:
1594     case OP_TYPEPLUS:
1595     case OP_TYPEMINPLUS:
1596 nigel 93 case OP_TYPEPOSPLUS:
1597 nigel 77 case OP_TYPEEXACT:
1598     return FALSE;
1599    
1600     /* End of branch */
1601    
1602     case OP_KET:
1603     case OP_KETRMAX:
1604     case OP_KETRMIN:
1605     case OP_ALT:
1606     return TRUE;
1607    
1608 nigel 93 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1609     MINUPTO, and POSUPTO may be followed by a multibyte character */
1610 nigel 77
1611     #ifdef SUPPORT_UTF8
1612     case OP_STAR:
1613     case OP_MINSTAR:
1614 nigel 93 case OP_POSSTAR:
1615 nigel 77 case OP_QUERY:
1616     case OP_MINQUERY:
1617 nigel 93 case OP_POSQUERY:
1618 nigel 77 case OP_UPTO:
1619     case OP_MINUPTO:
1620 nigel 93 case OP_POSUPTO:
1621 nigel 77 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1622     break;
1623     #endif
1624     }
1625     }
1626    
1627     return TRUE;
1628     }
1629    
1630    
1631    
1632     /*************************************************
1633     * Scan compiled regex for non-emptiness *
1634     *************************************************/
1635    
1636     /* This function is called to check for left recursive calls. We want to check
1637     the current branch of the current pattern to see if it could match the empty
1638     string. If it could, we must look outwards for branches at other levels,
1639     stopping when we pass beyond the bracket which is the subject of the recursion.
1640    
1641     Arguments:
1642     code points to start of the recursion
1643     endcode points to where to stop (current RECURSE item)
1644     bcptr points to the chain of current (unclosed) branch starts
1645     utf8 TRUE if in UTF-8 mode
1646    
1647     Returns: TRUE if what is matched could be empty
1648     */
1649    
1650     static BOOL
1651     could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1652     BOOL utf8)
1653     {
1654     while (bcptr != NULL && bcptr->current >= code)
1655     {
1656     if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1657     bcptr = bcptr->outer;
1658     }
1659     return TRUE;
1660     }
1661    
1662    
1663    
1664     /*************************************************
1665     * Check for POSIX class syntax *
1666     *************************************************/
1667    
1668     /* This function is called when the sequence "[:" or "[." or "[=" is
1669     encountered in a character class. It checks whether this is followed by an
1670     optional ^ and then a sequence of letters, terminated by a matching ":]" or
1671     ".]" or "=]".
1672    
1673     Argument:
1674     ptr pointer to the initial [
1675     endptr where to return the end pointer
1676     cd pointer to compile data
1677    
1678     Returns: TRUE or FALSE
1679     */
1680    
1681     static BOOL
1682     check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1683     {
1684     int terminator; /* Don't combine these lines; the Solaris cc */
1685     terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1686     if (*(++ptr) == '^') ptr++;
1687     while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1688     if (*ptr == terminator && ptr[1] == ']')
1689     {
1690     *endptr = ptr;
1691     return TRUE;
1692     }
1693     return FALSE;
1694     }
1695    
1696    
1697    
1698    
1699     /*************************************************
1700     * Check POSIX class name *
1701     *************************************************/
1702    
1703     /* This function is called to check the name given in a POSIX-style class entry
1704     such as [:alnum:].
1705    
1706     Arguments:
1707     ptr points to the first letter
1708     len the length of the name
1709    
1710     Returns: a value representing the name, or -1 if unknown
1711     */
1712    
1713     static int
1714     check_posix_name(const uschar *ptr, int len)
1715     {
1716     register int yield = 0;
1717     while (posix_name_lengths[yield] != 0)
1718     {
1719     if (len == posix_name_lengths[yield] &&
1720     strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1721     yield++;
1722     }
1723     return -1;
1724     }
1725    
1726    
1727     /*************************************************
1728     * Adjust OP_RECURSE items in repeated group *
1729     *************************************************/
1730    
1731     /* OP_RECURSE items contain an offset from the start of the regex to the group
1732     that is referenced. This means that groups can be replicated for fixed
1733     repetition simply by copying (because the recursion is allowed to refer to
1734     earlier groups that are outside the current group). However, when a group is
1735     optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1736     it, after it has been compiled. This means that any OP_RECURSE items within it
1737     that refer to the group itself or any contained groups have to have their
1738 nigel 93 offsets adjusted. That is one of the jobs of this function. Before it is called,
1739     the partially compiled regex must be temporarily terminated with OP_END.
1740 nigel 77
1741 nigel 93 This function has been extended with the possibility of forward references for
1742     recursions and subroutine calls. It must also check the list of such references
1743     for the group we are dealing with. If it finds that one of the recursions in
1744     the current group is on this list, it adjusts the offset in the list, not the
1745     value in the reference (which is a group number).
1746    
1747 nigel 77 Arguments:
1748     group points to the start of the group
1749     adjust the amount by which the group is to be moved
1750     utf8 TRUE in UTF-8 mode
1751     cd contains pointers to tables etc.
1752 nigel 93 save_hwm the hwm forward reference pointer at the start of the group
1753 nigel 77
1754     Returns: nothing
1755     */
1756    
1757     static void
1758 nigel 93 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1759     uschar *save_hwm)
1760 nigel 77 {
1761     uschar *ptr = group;
1762     while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1763     {
1764 nigel 93 int offset;
1765     uschar *hc;
1766    
1767     /* See if this recursion is on the forward reference list. If so, adjust the
1768     reference. */
1769    
1770     for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1771     {
1772     offset = GET(hc, 0);
1773     if (cd->start_code + offset == ptr + 1)
1774     {
1775     PUT(hc, 0, offset + adjust);
1776     break;
1777     }
1778     }
1779    
1780     /* Otherwise, adjust the recursion offset if it's after the start of this
1781     group. */
1782    
1783     if (hc >= cd->hwm)
1784     {
1785     offset = GET(ptr, 1);
1786     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1787     }
1788    
1789 nigel 77 ptr += 1 + LINK_SIZE;
1790     }
1791     }
1792    
1793    
1794    
1795     /*************************************************
1796     * Insert an automatic callout point *
1797     *************************************************/
1798    
1799     /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1800     callout points before each pattern item.
1801    
1802     Arguments:
1803     code current code pointer
1804     ptr current pattern pointer
1805     cd pointers to tables etc
1806    
1807     Returns: new code pointer
1808     */
1809    
1810     static uschar *
1811     auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1812     {
1813     *code++ = OP_CALLOUT;
1814     *code++ = 255;
1815     PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1816     PUT(code, LINK_SIZE, 0); /* Default length */
1817     return code + 2*LINK_SIZE;
1818     }
1819    
1820    
1821    
1822     /*************************************************
1823     * Complete a callout item *
1824     *************************************************/
1825    
1826     /* A callout item contains the length of the next item in the pattern, which
1827     we can't fill in till after we have reached the relevant point. This is used
1828     for both automatic and manual callouts.
1829    
1830     Arguments:
1831     previous_callout points to previous callout item
1832     ptr current pattern pointer
1833     cd pointers to tables etc
1834    
1835     Returns: nothing
1836     */
1837    
1838     static void
1839     complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1840     {
1841     int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1842     PUT(previous_callout, 2 + LINK_SIZE, length);
1843     }
1844    
1845    
1846    
#ifdef SUPPORT_UCP
/*************************************************
*           Get othercase range                  *
*************************************************/

/* This function is passed the start and end of a class range, in UTF-8 mode
with UCP support. It scans upwards from the starting character for the first
one that has an "other" case, and then extends over the following characters
for as long as their other cases are consecutive. Each call returns the next
such run, updating the start value so that a subsequent call continues from
where this one stopped. When FALSE is returned, nothing is written through
any of the pointers.

Arguments:
  cptr        points to starting character value; updated
  d           end value
  ocptr       where to put start of othercase range
  odptr       where to put end of othercase range

Yield:        TRUE when range returned; FALSE when no more
*/

static BOOL
get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
  unsigned int *odptr)
{
unsigned int cur = *cptr;
unsigned int oc_first = NOTACHAR;
unsigned int oc_expect;

/* Find the first character in the range that has another case */

while (cur <= d && (oc_first = _pcre_ucp_othercase(cur)) == NOTACHAR) cur++;
if (cur > d) return FALSE;

*ocptr = oc_first;
oc_expect = oc_first + 1;

/* Extend while the other cases of successive characters are contiguous */

cur++;
while (cur <= d && _pcre_ucp_othercase(cur) == oc_expect)
  {
  oc_expect++;
  cur++;
  }

*odptr = oc_expect - 1;
*cptr = cur;

return TRUE;
}
#endif  /* SUPPORT_UCP */
1892    
1893    
1894 nigel 93
1895 nigel 77 /*************************************************
1896 nigel 93 * Check if auto-possessifying is possible *
1897     *************************************************/
1898    
1899     /* This function is called for unlimited repeats of certain items, to see
1900     whether the next thing could possibly match the repeated item. If not, it makes
1901     sense to automatically possessify the repeated item.
1902    
1903     Arguments:
1904     op_code the repeated op code
1905     this data for this item, depends on the opcode
1906     utf8 TRUE in UTF-8 mode
1907     utf8_char used for utf8 character bytes, NULL if not relevant
1908     ptr next character in pattern
1909     options options bits
1910     cd contains pointers to tables etc.
1911    
1912     Returns: TRUE if possessifying is wanted
1913     */
1914    
1915     static BOOL
1916     check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
1917     const uschar *ptr, int options, compile_data *cd)
1918     {
1919     int next;
1920    
1921     /* Skip whitespace and comments in extended mode */
1922    
1923     if ((options & PCRE_EXTENDED) != 0)
1924     {
1925     for (;;)
1926     {
1927     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1928     if (*ptr == '#')
1929     {
1930     while (*(++ptr) != 0)
1931     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1932     }
1933     else break;
1934     }
1935     }
1936    
1937     /* If the next item is one that we can handle, get its value. A non-negative
1938     value is a character, a negative value is an escape value. */
1939    
1940     if (*ptr == '\\')
1941     {
1942     int temperrorcode = 0;
1943     next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
1944     if (temperrorcode != 0) return FALSE;
1945     ptr++; /* Point after the escape sequence */
1946     }
1947    
1948     else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
1949     {
1950     #ifdef SUPPORT_UTF8
1951     if (utf8) { GETCHARINC(next, ptr); } else
1952     #endif
1953     next = *ptr++;
1954     }
1955    
1956     else return FALSE;
1957    
1958     /* Skip whitespace and comments in extended mode */
1959    
1960     if ((options & PCRE_EXTENDED) != 0)
1961     {
1962     for (;;)
1963     {
1964     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1965     if (*ptr == '#')
1966     {
1967     while (*(++ptr) != 0)
1968     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1969     }
1970     else break;
1971     }
1972     }
1973    
1974     /* If the next thing is itself optional, we have to give up. */
1975    
1976     if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
1977     return FALSE;
1978    
1979     /* Now compare the next item with the previous opcode. If the previous is a
1980     positive single character match, "item" either contains the character or, if
1981     "item" is greater than 127 in utf8 mode, the character's bytes are in
1982     utf8_char. */
1983    
1984    
1985     /* Handle cases when the next item is a character. */
1986    
1987     if (next >= 0) switch(op_code)
1988     {
1989     case OP_CHAR:
1990     #ifdef SUPPORT_UTF8
1991     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1992     #endif
1993     return item != next;
1994    
1995     /* For CHARNC (caseless character) we must check the other case. If we have
1996     Unicode property support, we can use it to test the other case of
1997     high-valued characters. */
1998    
1999     case OP_CHARNC:
2000     #ifdef SUPPORT_UTF8
2001     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2002     #endif
2003     if (item == next) return FALSE;
2004     #ifdef SUPPORT_UTF8
2005     if (utf8)
2006     {
2007     unsigned int othercase;
2008     if (next < 128) othercase = cd->fcc[next]; else
2009     #ifdef SUPPORT_UCP
2010     othercase = _pcre_ucp_othercase((unsigned int)next);
2011     #else
2012     othercase = NOTACHAR;
2013     #endif
2014     return (unsigned int)item != othercase;
2015     }
2016     else
2017     #endif /* SUPPORT_UTF8 */
2018     return (item != cd->fcc[next]); /* Non-UTF-8 mode */
2019    
2020     /* For OP_NOT, "item" must be a single-byte character. */
2021    
2022     case OP_NOT:
2023     if (next < 0) return FALSE; /* Not a character */
2024     if (item == next) return TRUE;
2025     if ((options & PCRE_CASELESS) == 0) return FALSE;
2026     #ifdef SUPPORT_UTF8
2027     if (utf8)
2028     {
2029     unsigned int othercase;
2030     if (next < 128) othercase = cd->fcc[next]; else
2031     #ifdef SUPPORT_UCP
2032     othercase = _pcre_ucp_othercase(next);
2033     #else
2034     othercase = NOTACHAR;
2035     #endif
2036     return (unsigned int)item == othercase;
2037     }
2038     else
2039     #endif /* SUPPORT_UTF8 */
2040     return (item == cd->fcc[next]); /* Non-UTF-8 mode */
2041    
2042     case OP_DIGIT:
2043     return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2044    
2045     case OP_NOT_DIGIT:
2046     return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2047    
2048     case OP_WHITESPACE:
2049     return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2050    
2051     case OP_NOT_WHITESPACE:
2052     return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2053    
2054     case OP_WORDCHAR:
2055     return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2056    
2057     case OP_NOT_WORDCHAR:
2058     return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2059    
2060 ph10 180 case OP_HSPACE:
2061     case OP_NOT_HSPACE:
2062     switch(next)
2063     {
2064     case 0x09:
2065     case 0x20:
2066     case 0xa0:
2067     case 0x1680:
2068     case 0x180e:
2069     case 0x2000:
2070     case 0x2001:
2071     case 0x2002:
2072     case 0x2003:
2073     case 0x2004:
2074     case 0x2005:
2075     case 0x2006:
2076     case 0x2007:
2077     case 0x2008:
2078     case 0x2009:
2079     case 0x200A:
2080     case 0x202f:
2081     case 0x205f:
2082     case 0x3000:
2083     return op_code != OP_HSPACE;
2084     default:
2085     return op_code == OP_HSPACE;
2086     }
2087    
2088     case OP_VSPACE:
2089     case OP_NOT_VSPACE:
2090     switch(next)
2091     {
2092     case 0x0a:
2093     case 0x0b:
2094     case 0x0c:
2095     case 0x0d:
2096     case 0x85:
2097     case 0x2028:
2098     case 0x2029:
2099     return op_code != OP_VSPACE;
2100     default:
2101     return op_code == OP_VSPACE;
2102     }
2103    
2104 nigel 93 default:
2105     return FALSE;
2106     }
2107    
2108    
2109     /* Handle the case when the next item is \d, \s, etc. */
2110    
2111     switch(op_code)
2112     {
2113     case OP_CHAR:
2114     case OP_CHARNC:
2115     #ifdef SUPPORT_UTF8
2116     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2117     #endif
2118     switch(-next)
2119     {
2120     case ESC_d:
2121     return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2122    
2123     case ESC_D:
2124     return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2125    
2126     case ESC_s:
2127     return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2128    
2129     case ESC_S:
2130     return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2131    
2132     case ESC_w:
2133     return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2134    
2135     case ESC_W:
2136     return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2137 ph10 182
2138 ph10 180 case ESC_h:
2139     case ESC_H:
2140     switch(item)
2141     {
2142     case 0x09:
2143     case 0x20:
2144     case 0xa0:
2145     case 0x1680:
2146     case 0x180e:
2147     case 0x2000:
2148     case 0x2001:
2149     case 0x2002:
2150     case 0x2003:
2151     case 0x2004:
2152     case 0x2005:
2153     case 0x2006:
2154     case 0x2007:
2155     case 0x2008:
2156     case 0x2009:
2157     case 0x200A:
2158     case 0x202f:
2159     case 0x205f:
2160     case 0x3000:
2161     return -next != ESC_h;
2162     default:
2163     return -next == ESC_h;
2164 ph10 182 }
2165    
2166 ph10 180 case ESC_v:
2167     case ESC_V:
2168     switch(item)
2169     {
2170     case 0x0a:
2171     case 0x0b:
2172     case 0x0c:
2173     case 0x0d:
2174     case 0x85:
2175     case 0x2028:
2176     case 0x2029:
2177     return -next != ESC_v;
2178     default:
2179     return -next == ESC_v;
2180 ph10 182 }
2181 nigel 93
2182     default:
2183     return FALSE;
2184     }
2185    
2186     case OP_DIGIT:
2187 ph10 180 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2188     next == -ESC_h || next == -ESC_v;
2189 nigel 93
2190     case OP_NOT_DIGIT:
2191     return next == -ESC_d;
2192    
2193     case OP_WHITESPACE:
2194     return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2195    
2196     case OP_NOT_WHITESPACE:
2197 ph10 180 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2198 nigel 93
2199 ph10 180 case OP_HSPACE:
2200     return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2201    
2202     case OP_NOT_HSPACE:
2203     return next == -ESC_h;
2204 ph10 182
2205 ph10 180 /* Can't have \S in here because VT matches \S (Perl anomaly) */
2206 ph10 182 case OP_VSPACE:
2207 ph10 180 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2208    
2209     case OP_NOT_VSPACE:
2210 ph10 182 return next == -ESC_v;
2211 ph10 180
2212 nigel 93 case OP_WORDCHAR:
2213 ph10 180 return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2214 nigel 93
2215     case OP_NOT_WORDCHAR:
2216     return next == -ESC_w || next == -ESC_d;
2217 ph10 182
2218 nigel 93 default:
2219     return FALSE;
2220     }
2221    
2222     /* Control does not reach here */
2223     }
2224    
2225    
2226    
2227     /*************************************************
2228 nigel 77 * Compile one branch *
2229     *************************************************/
2230    
2231 nigel 93 /* Scan the pattern, compiling it into a vector. If the options are
2232 nigel 77 changed during the branch, the pointer is used to change the external options
2233 nigel 93 bits. This function is used during the pre-compile phase when we are trying
2234     to find out the amount of memory needed, as well as during the real compile
2235     phase. The value of lengthptr distinguishes the two phases.
2236 nigel 77
2237     Arguments:
2238     optionsptr pointer to the option bits
2239     codeptr points to the pointer to the current code point
2240     ptrptr points to the current pattern pointer
2241     errorcodeptr points to error code variable
2242     firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2243     reqbyteptr set to the last literal character required, else < 0
2244     bcptr points to current branch chain
2245     cd contains pointers to tables etc.
2246 nigel 93 lengthptr NULL during the real compile phase
2247     points to length accumulator during pre-compile phase
2248 nigel 77
2249     Returns: TRUE on success
2250     FALSE, with *errorcodeptr set non-zero on error
2251     */
2252    
2253     static BOOL
2254 nigel 93 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2255     int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2256     compile_data *cd, int *lengthptr)
2257 nigel 77 {
2258     int repeat_type, op_type;
2259     int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2260     int bravalue = 0;
2261     int greedy_default, greedy_non_default;
2262     int firstbyte, reqbyte;
2263     int zeroreqbyte, zerofirstbyte;
2264     int req_caseopt, reqvary, tempreqvary;
2265     int options = *optionsptr;
2266     int after_manual_callout = 0;
2267 nigel 93 int length_prevgroup = 0;
2268 nigel 77 register int c;
2269     register uschar *code = *codeptr;
2270 nigel 93 uschar *last_code = code;
2271     uschar *orig_code = code;
2272 nigel 77 uschar *tempcode;
2273     BOOL inescq = FALSE;
2274     BOOL groupsetfirstbyte = FALSE;
2275     const uschar *ptr = *ptrptr;
2276     const uschar *tempptr;
2277     uschar *previous = NULL;
2278     uschar *previous_callout = NULL;
2279 nigel 93 uschar *save_hwm = NULL;
2280 nigel 77 uschar classbits[32];
2281    
2282     #ifdef SUPPORT_UTF8
2283     BOOL class_utf8;
2284     BOOL utf8 = (options & PCRE_UTF8) != 0;
2285     uschar *class_utf8data;
2286     uschar utf8_char[6];
2287     #else
2288     BOOL utf8 = FALSE;
2289 nigel 93 uschar *utf8_char = NULL;
2290 nigel 77 #endif
2291    
2292 nigel 93 #ifdef DEBUG
2293     if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2294     #endif
2295    
2296 nigel 77 /* Set up the default and non-default settings for greediness */
2297    
2298     greedy_default = ((options & PCRE_UNGREEDY) != 0);
2299     greedy_non_default = greedy_default ^ 1;
2300    
2301     /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2302     matching encountered yet". It gets changed to REQ_NONE if we hit something that
2303     matches a non-fixed char first char; reqbyte just remains unset if we never
2304     find one.
2305    
2306     When we hit a repeat whose minimum is zero, we may have to adjust these values
2307     to take the zero repeat into account. This is implemented by setting them to
2308     zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2309     item types that can be repeated set these backoff variables appropriately. */
2310    
2311     firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2312    
2313     /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2314     according to the current setting of the caseless flag. REQ_CASELESS is a bit
2315     value > 255. It is added into the firstbyte or reqbyte variables to record the
2316     case status of the value. This is used only for ASCII characters. */
2317    
2318     req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2319    
2320     /* Switch on next character until the end of the branch */
2321    
2322     for (;; ptr++)
2323     {
2324     BOOL negate_class;
2325     BOOL possessive_quantifier;
2326     BOOL is_quantifier;
2327 nigel 93 BOOL is_recurse;
2328 ph10 180 BOOL reset_bracount;
2329 nigel 77 int class_charcount;
2330     int class_lastchar;
2331     int newoptions;
2332     int recno;
2333 ph10 172 int refsign;
2334 nigel 77 int skipbytes;
2335     int subreqbyte;
2336     int subfirstbyte;
2337 nigel 93 int terminator;
2338 nigel 77 int mclength;
2339     uschar mcbuffer[8];
2340    
2341 nigel 93 /* Get next byte in the pattern */
2342 nigel 77
2343     c = *ptr;
2344    
2345 nigel 93 /* If we are in the pre-compile phase, accumulate the length used for the
2346     previous cycle of this loop. */
2347    
2348     if (lengthptr != NULL)
2349     {
2350     #ifdef DEBUG
2351     if (code > cd->hwm) cd->hwm = code; /* High water info */
2352     #endif
2353     if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2354     {
2355     *errorcodeptr = ERR52;
2356     goto FAILED;
2357     }
2358    
2359     /* There is at least one situation where code goes backwards: this is the
2360     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2361     the class is simply eliminated. However, it is created first, so we have to
2362     allow memory for it. Therefore, don't ever reduce the length at this point.
2363     */
2364    
2365     if (code < last_code) code = last_code;
2366 ph10 202
2367     /* Paranoid check for integer overflow */
2368    
2369     if (OFLOW_MAX - *lengthptr < code - last_code)
2370     {
2371     *errorcodeptr = ERR20;
2372     goto FAILED;
2373     }
2374    
2375 nigel 93 *lengthptr += code - last_code;
2376     DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2377    
2378     /* If "previous" is set and it is not at the start of the work space, move
2379     it back to there, in order to avoid filling up the work space. Otherwise,
2380     if "previous" is NULL, reset the current code pointer to the start. */
2381    
2382     if (previous != NULL)
2383     {
2384     if (previous > orig_code)
2385     {
2386     memmove(orig_code, previous, code - previous);
2387     code -= previous - orig_code;
2388     previous = orig_code;
2389     }
2390     }
2391     else code = orig_code;
2392    
2393     /* Remember where this code item starts so we can pick up the length
2394     next time round. */
2395    
2396     last_code = code;
2397     }
2398    
2399     /* In the real compile phase, just check the workspace used by the forward
2400     reference list. */
2401    
2402     else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2403     {
2404     *errorcodeptr = ERR52;
2405     goto FAILED;
2406     }
2407    
2408 nigel 77 /* If in \Q...\E, check for the end; if not, we have a literal */
2409    
2410     if (inescq && c != 0)
2411     {
2412     if (c == '\\' && ptr[1] == 'E')
2413     {
2414     inescq = FALSE;
2415     ptr++;
2416     continue;
2417     }
2418     else
2419     {
2420     if (previous_callout != NULL)
2421     {
2422 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2423     complete_callout(previous_callout, ptr, cd);
2424 nigel 77 previous_callout = NULL;
2425     }
2426     if ((options & PCRE_AUTO_CALLOUT) != 0)
2427     {
2428     previous_callout = code;
2429     code = auto_callout(code, ptr, cd);
2430     }
2431     goto NORMAL_CHAR;
2432     }
2433     }
2434    
2435     /* Fill in length of a previous callout, except when the next thing is
2436     a quantifier. */
2437    
2438     is_quantifier = c == '*' || c == '+' || c == '?' ||
2439     (c == '{' && is_counted_repeat(ptr+1));
2440    
2441     if (!is_quantifier && previous_callout != NULL &&
2442     after_manual_callout-- <= 0)
2443     {
2444 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2445     complete_callout(previous_callout, ptr, cd);
2446 nigel 77 previous_callout = NULL;
2447     }
2448    
2449     /* In extended mode, skip white space and comments */
2450    
2451     if ((options & PCRE_EXTENDED) != 0)
2452     {
2453     if ((cd->ctypes[c] & ctype_space) != 0) continue;
2454     if (c == '#')
2455     {
2456 nigel 93 while (*(++ptr) != 0)
2457 nigel 91 {
2458 nigel 93 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2459 nigel 91 }
2460 nigel 93 if (*ptr != 0) continue;
2461    
2462 nigel 91 /* Else fall through to handle end of string */
2463     c = 0;
2464 nigel 77 }
2465     }
2466    
2467     /* No auto callout for quantifiers. */
2468    
2469     if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2470     {
2471     previous_callout = code;
2472     code = auto_callout(code, ptr, cd);
2473     }
2474    
2475     switch(c)
2476     {
2477 nigel 93 /* ===================================================================*/
2478     case 0: /* The branch terminates at string end */
2479     case '|': /* or | or ) */
2480 nigel 77 case ')':
2481     *firstbyteptr = firstbyte;
2482     *reqbyteptr = reqbyte;
2483     *codeptr = code;
2484     *ptrptr = ptr;
2485 nigel 93 if (lengthptr != NULL)
2486     {
2487 ph10 202 if (OFLOW_MAX - *lengthptr < code - last_code)
2488     {
2489     *errorcodeptr = ERR20;
2490     goto FAILED;
2491     }
2492 nigel 93 *lengthptr += code - last_code; /* To include callout length */
2493     DPRINTF((">> end branch\n"));
2494     }
2495 nigel 77 return TRUE;
2496    
2497 nigel 93
2498     /* ===================================================================*/
2499 nigel 77 /* Handle single-character metacharacters. In multiline mode, ^ disables
2500     the setting of any following char as a first character. */
2501    
2502     case '^':
2503     if ((options & PCRE_MULTILINE) != 0)
2504     {
2505     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2506     }
2507     previous = NULL;
2508     *code++ = OP_CIRC;
2509     break;
2510    
2511     case '$':
2512     previous = NULL;
2513     *code++ = OP_DOLL;
2514     break;
2515    
2516     /* There can never be a first char if '.' is first, whatever happens about
2517     repeats. The value of reqbyte doesn't change either. */
2518    
2519     case '.':
2520     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2521     zerofirstbyte = firstbyte;
2522     zeroreqbyte = reqbyte;
2523     previous = code;
2524     *code++ = OP_ANY;
2525     break;
2526    
2527 nigel 93
2528     /* ===================================================================*/
2529 nigel 87 /* Character classes. If the included characters are all < 256, we build a
2530     32-byte bitmap of the permitted characters, except in the special case
2531     where there is only one such character. For negated classes, we build the
2532     map as usual, then invert it at the end. However, we use a different opcode
2533     so that data characters > 255 can be handled correctly.
2534 nigel 77
2535     If the class contains characters outside the 0-255 range, a different
2536     opcode is compiled. It may optionally have a bit map for characters < 256,
2537     but those above are explicitly listed afterwards. A flag byte tells
2538     whether the bitmap is present, and whether this is a negated class or not.
2539     */
2540    
2541     case '[':
2542     previous = code;
2543    
2544     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2545     they are encountered at the top level, so we'll do that too. */
2546    
2547     if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2548     check_posix_syntax(ptr, &tempptr, cd))
2549     {
2550     *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2551     goto FAILED;
2552     }
2553    
2554 ph10 205 /* If the first character is '^', set the negation flag and skip it. Also,
2555 ph10 208 if the first few characters (either before or after ^) are \Q\E or \E we
2556 ph10 205 skip them too. This makes for compatibility with Perl. */
2557 ph10 208
2558 ph10 205 negate_class = FALSE;
2559     for (;;)
2560 nigel 77 {
2561     c = *(++ptr);
2562 ph10 205 if (c == '\\')
2563     {
2564 ph10 208 if (ptr[1] == 'E') ptr++;
2565 ph10 205 else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
2566 ph10 208 else break;
2567 ph10 205 }
2568     else if (!negate_class && c == '^')
2569     negate_class = TRUE;
2570     else break;
2571 ph10 208 }
2572 nigel 77
2573     /* Keep a count of chars with values < 256 so that we can optimize the case
2574 nigel 93 of just a single character (as long as it's < 256). However, For higher
2575     valued UTF-8 characters, we don't yet do any optimization. */
2576 nigel 77
2577     class_charcount = 0;
2578     class_lastchar = -1;
2579    
2580 nigel 93 /* Initialize the 32-char bit map to all zeros. We build the map in a
2581     temporary bit of memory, in case the class contains only 1 character (less
2582     than 256), because in that case the compiled code doesn't use the bit map.
2583     */
2584    
2585     memset(classbits, 0, 32 * sizeof(uschar));
2586    
2587 nigel 77 #ifdef SUPPORT_UTF8
2588     class_utf8 = FALSE; /* No chars >= 256 */
2589 nigel 93 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
2590 nigel 77 #endif
2591    
2592     /* Process characters until ] is reached. By writing this as a "do" it
2593 nigel 93 means that an initial ] is taken as a data character. At the start of the
2594     loop, c contains the first byte of the character. */
2595 nigel 77
2596 nigel 93 if (c != 0) do
2597 nigel 77 {
2598 nigel 93 const uschar *oldptr;
2599    
2600 nigel 77 #ifdef SUPPORT_UTF8
2601     if (utf8 && c > 127)
2602     { /* Braces are required because the */
2603     GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2604     }
2605     #endif
2606    
2607     /* Inside \Q...\E everything is literal except \E */
2608    
2609     if (inescq)
2610     {
2611 nigel 93 if (c == '\\' && ptr[1] == 'E') /* If we are at \E */
2612 nigel 77 {
2613 nigel 93 inescq = FALSE; /* Reset literal state */
2614     ptr++; /* Skip the 'E' */
2615     continue; /* Carry on with next */
2616 nigel 77 }
2617 nigel 93 goto CHECK_RANGE; /* Could be range if \E follows */
2618 nigel 77 }
2619    
2620     /* Handle POSIX class names. Perl allows a negation extension of the
2621     form [:^name:]. A square bracket that doesn't match the syntax is
2622     treated as a literal. We also recognize the POSIX constructions
2623     [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2624     5.6 and 5.8 do. */
2625    
2626     if (c == '[' &&
2627     (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2628     check_posix_syntax(ptr, &tempptr, cd))
2629     {
2630     BOOL local_negate = FALSE;
2631 nigel 87 int posix_class, taboffset, tabopt;
2632 nigel 77 register const uschar *cbits = cd->cbits;
2633 nigel 87 uschar pbits[32];
2634 nigel 77
2635     if (ptr[1] != ':')
2636     {
2637     *errorcodeptr = ERR31;
2638     goto FAILED;
2639     }
2640    
2641     ptr += 2;
2642     if (*ptr == '^')
2643     {
2644     local_negate = TRUE;
2645     ptr++;
2646     }
2647    
2648     posix_class = check_posix_name(ptr, tempptr - ptr);
2649     if (posix_class < 0)
2650     {
2651     *errorcodeptr = ERR30;
2652     goto FAILED;
2653     }
2654    
2655     /* If matching is caseless, upper and lower are converted to
2656     alpha. This relies on the fact that the class table starts with
2657     alpha, lower, upper as the first 3 entries. */
2658    
2659     if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2660     posix_class = 0;
2661    
2662 nigel 87 /* We build the bit map for the POSIX class in a chunk of local store
2663     because we may be adding and subtracting from it, and we don't want to
2664     subtract bits that may be in the main map already. At the end we or the
2665     result into the bit map that is being built. */
2666 nigel 77
2667     posix_class *= 3;
2668 nigel 87
2669     /* Copy in the first table (always present) */
2670    
2671     memcpy(pbits, cbits + posix_class_maps[posix_class],
2672     32 * sizeof(uschar));
2673    
2674     /* If there is a second table, add or remove it as required. */
2675    
2676     taboffset = posix_class_maps[posix_class + 1];
2677     tabopt = posix_class_maps[posix_class + 2];
2678    
2679     if (taboffset >= 0)
2680 nigel 77 {
2681 nigel 87 if (tabopt >= 0)
2682     for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
2683 nigel 77 else
2684 nigel 87 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
2685 nigel 77 }
2686    
2687 nigel 87 /* Now see if we need to remove any special characters. An option
2688     value of 1 removes vertical space and 2 removes underscore. */
2689    
2690     if (tabopt < 0) tabopt = -tabopt;
2691     if (tabopt == 1) pbits[1] &= ~0x3c;
2692     else if (tabopt == 2) pbits[11] &= 0x7f;
2693    
2694     /* Add the POSIX table or its complement into the main table that is
2695     being built and we are done. */
2696    
2697     if (local_negate)
2698     for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2699     else
2700     for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2701    
2702 nigel 77 ptr = tempptr + 1;
2703     class_charcount = 10; /* Set > 1; assumes more than 1 per class */
2704     continue; /* End of POSIX syntax handling */
2705     }
2706    
2707     /* Backslash may introduce a single character, or it may introduce one
2708 nigel 93 of the specials, which just set a flag. The sequence \b is a special
2709     case. Inside a class (and only there) it is treated as backspace.
2710     Elsewhere it marks a word boundary. Other escapes have preset maps ready
2711 ph10 205 to 'or' into the one we are building. We assume they have more than one
2712 nigel 77 character in them, so set class_charcount bigger than one. */
2713    
2714     if (c == '\\')
2715     {
2716 nigel 93 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2717     if (*errorcodeptr != 0) goto FAILED;
2718 nigel 77
2719     if (-c == ESC_b) c = '\b'; /* \b is backspace in a class */
2720     else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
2721 nigel 93 else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */
2722 nigel 77 else if (-c == ESC_Q) /* Handle start of quoted string */
2723     {
2724     if (ptr[1] == '\\' && ptr[2] == 'E')
2725     {
2726     ptr += 2; /* avoid empty string */
2727     }
2728     else inescq = TRUE;
2729     continue;
2730     }
2731 ph10 220 else if (-c == ESC_E) continue; /* Ignore orphan \E */
2732 nigel 77
2733     if (c < 0)
2734     {
2735     register const uschar *cbits = cd->cbits;
2736     class_charcount += 2; /* Greater than 1 is what matters */
2737 nigel 93
2738     /* Save time by not doing this in the pre-compile phase. */
2739    
2740     if (lengthptr == NULL) switch (-c)
2741 nigel 77 {
2742     case ESC_d:
2743     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2744     continue;
2745    
2746     case ESC_D:
2747     for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2748     continue;
2749    
2750     case ESC_w:
2751     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2752     continue;
2753    
2754     case ESC_W:
2755     for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2756     continue;
2757    
2758     case ESC_s:
2759     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2760     classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2761     continue;
2762    
2763     case ESC_S:
2764     for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2765     classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2766     continue;
2767    
2768 nigel 93 case ESC_E: /* Perl ignores an orphan \E */
2769     continue;
2770 ph10 180
2771 nigel 93 default: /* Not recognized; fall through */
2772     break; /* Need "default" setting to stop compiler warning. */
2773     }
2774    
2775     /* In the pre-compile phase, just do the recognition. */
2776    
2777     else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2778     c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2779 ph10 180
2780 ph10 178 /* We need to deal with \H, \h, \V, and \v in both phases because
2781     they use extra memory. */
2782 ph10 180
2783 ph10 178 if (-c == ESC_h)
2784     {
2785     SETBIT(classbits, 0x09); /* HT */
2786     SETBIT(classbits, 0x20); /* SPACE */
2787 ph10 180 SETBIT(classbits, 0xa0); /* NBSP */
2788 ph10 178 #ifdef SUPPORT_UTF8
2789     if (utf8)
2790 ph10 180 {
2791 ph10 178 class_utf8 = TRUE;
2792     *class_utf8data++ = XCL_SINGLE;
2793 ph10 180 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2794 ph10 178 *class_utf8data++ = XCL_SINGLE;
2795 ph10 180 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2796     *class_utf8data++ = XCL_RANGE;
2797     class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2798     class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2799 ph10 178 *class_utf8data++ = XCL_SINGLE;
2800 ph10 180 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2801 ph10 178 *class_utf8data++ = XCL_SINGLE;
2802 ph10 180 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2803 ph10 178 *class_utf8data++ = XCL_SINGLE;
2804 ph10 180 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
2805     }
2806     #endif
2807     continue;
2808     }
2809 nigel 93
2810 ph10 178 if (-c == ESC_H)
2811     {
2812     for (c = 0; c < 32; c++)
2813     {
2814     int x = 0xff;
2815     switch (c)
2816 ph10 180 {
2817 ph10 178 case 0x09/8: x ^= 1 << (0x09%8); break;
2818     case 0x20/8: x ^= 1 << (0x20%8); break;
2819     case 0xa0/8: x ^= 1 << (0xa0%8); break;
2820     default: break;
2821     }
2822     classbits[c] |= x;
2823 ph10 180 }
2824    
2825 ph10 178 #ifdef SUPPORT_UTF8
2826     if (utf8)
2827 ph10 180 {
2828 ph10 178 class_utf8 = TRUE;
2829 ph10 180 *class_utf8data++ = XCL_RANGE;
2830     class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2831     class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
2832     *class_utf8data++ = XCL_RANGE;
2833     class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
2834     class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
2835     *class_utf8data++ = XCL_RANGE;
2836     class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
2837     class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
2838     *class_utf8data++ = XCL_RANGE;
2839     class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
2840     class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
2841     *class_utf8data++ = XCL_RANGE;
2842     class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
2843     class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
2844     *class_utf8data++ = XCL_RANGE;
2845     class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
2846     class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
2847     *class_utf8data++ = XCL_RANGE;
2848     class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
2849     class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2850     }
2851     #endif
2852     continue;
2853     }
2854 ph10 178
2855     if (-c == ESC_v)
2856     {
2857     SETBIT(classbits, 0x0a); /* LF */
2858     SETBIT(classbits, 0x0b); /* VT */
2859 ph10 180 SETBIT(classbits, 0x0c); /* FF */
2860     SETBIT(classbits, 0x0d); /* CR */
2861     SETBIT(classbits, 0x85); /* NEL */
2862 ph10 178 #ifdef SUPPORT_UTF8
2863     if (utf8)
2864 ph10 180 {
2865 ph10 178 class_utf8 = TRUE;
2866 ph10 180 *class_utf8data++ = XCL_RANGE;
2867     class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
2868     class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2869     }
2870     #endif
2871     continue;
2872     }
2873 ph10 178
2874     if (-c == ESC_V)
2875     {
2876     for (c = 0; c < 32; c++)
2877     {
2878     int x = 0xff;
2879     switch (c)
2880 ph10 180 {
2881 ph10 178 case 0x0a/8: x ^= 1 << (0x0a%8);
2882     x ^= 1 << (0x0b%8);
2883     x ^= 1 << (0x0c%8);
2884 ph10 180 x ^= 1 << (0x0d%8);
2885 ph10 178 break;
2886     case 0x85/8: x ^= 1 << (0x85%8); break;
2887     default: break;
2888     }
2889     classbits[c] |= x;
2890 ph10 180 }
2891    
2892 ph10 178 #ifdef SUPPORT_UTF8
2893     if (utf8)
2894 ph10 180 {
2895 ph10 178 class_utf8 = TRUE;
2896 ph10 180 *class_utf8data++ = XCL_RANGE;
2897     class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2898     class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
2899     *class_utf8data++ = XCL_RANGE;
2900     class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2901     class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2902     }
2903     #endif
2904     continue;
2905     }
2906 ph10 178
2907 nigel 93 /* We need to deal with \P and \p in both phases. */
2908    
2909 nigel 77 #ifdef SUPPORT_UCP
2910 nigel 93 if (-c == ESC_p || -c == ESC_P)
2911     {
2912     BOOL negated;
2913     int pdata;
2914     int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2915     if (ptype < 0) goto FAILED;
2916     class_utf8 = TRUE;
2917     *class_utf8data++ = ((-c == ESC_p) != negated)?
2918     XCL_PROP : XCL_NOTPROP;
2919     *class_utf8data++ = ptype;
2920     *class_utf8data++ = pdata;
2921     class_charcount -= 2; /* Not a < 256 character */
2922 nigel 77 continue;
2923 nigel 93 }
2924 nigel 77 #endif
2925 nigel 93 /* Unrecognized escapes are faulted if PCRE is running in its
2926     strict mode. By default, for compatibility with Perl, they are
2927     treated as literals. */
2928 nigel 77
2929 nigel 93 if ((options & PCRE_EXTRA) != 0)
2930     {
2931     *errorcodeptr = ERR7;
2932     goto FAILED;
2933     }
2934 nigel 77
2935 nigel 93 class_charcount -= 2; /* Undo the default count from above */
2936     c = *ptr; /* Get the final character and fall through */
2937 nigel 77 }
2938    
2939     /* Fall through if we have a single character (c >= 0). This may be
2940 nigel 93 greater than 256 in UTF-8 mode. */
2941 nigel 77
2942     } /* End of backslash handling */
2943    
2944     /* A single character may be followed by '-' to form a range. However,
2945     Perl does not permit ']' to be the end of the range. A '-' character
2946 nigel 93 at the end is treated as a literal. Perl ignores orphaned \E sequences
2947     entirely. The code for handling \Q and \E is messy. */
2948 nigel 77
2949 nigel 93 CHECK_RANGE:
2950     while (ptr[1] == '\\' && ptr[2] == 'E')
2951 nigel 77 {
2952 nigel 93 inescq = FALSE;
2953     ptr += 2;
2954     }
2955    
2956     oldptr = ptr;
2957    
2958     if (!inescq && ptr[1] == '-')
2959     {
2960 nigel 77 int d;
2961     ptr += 2;
2962 nigel 93 while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
2963 nigel 77
2964 nigel 93 /* If we hit \Q (not followed by \E) at this point, go into escaped
2965     mode. */
2966    
2967     while (*ptr == '\\' && ptr[1] == 'Q')
2968     {
2969     ptr += 2;
2970     if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
2971     inescq = TRUE;
2972     break;
2973     }
2974    
2975     if (*ptr == 0 || (!inescq && *ptr == ']'))
2976     {
2977     ptr = oldptr;
2978     goto LONE_SINGLE_CHARACTER;
2979     }
2980    
2981 nigel 77 #ifdef SUPPORT_UTF8
2982     if (utf8)
2983     { /* Braces are required because the */
2984     GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
2985     }
2986     else
2987     #endif
2988     d = *ptr; /* Not UTF-8 mode */
2989    
2990     /* The second part of a range can be a single-character escape, but
2991     not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2992     in such circumstances. */
2993    
2994 nigel 93 if (!inescq && d == '\\')
2995 nigel 77 {
2996 nigel 93 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2997     if (*errorcodeptr != 0) goto FAILED;
2998 nigel 77
2999 nigel 93 /* \b is backspace; \X is literal X; \R is literal R; any other
3000     special means the '-' was literal */
3001 nigel 77
3002     if (d < 0)
3003     {
3004     if (d == -ESC_b) d = '\b';
3005 nigel 93 else if (d == -ESC_X) d = 'X';
3006     else if (d == -ESC_R) d = 'R'; else
3007 nigel 77 {
3008 nigel 93 ptr = oldptr;
3009 nigel 77 goto LONE_SINGLE_CHARACTER; /* A few lines below */
3010     }
3011     }
3012     }
3013    
3014 nigel 93 /* Check that the two values are in the correct order. Optimize
3015     one-character ranges */
3016 nigel 77
3017 nigel 93 if (d < c)
3018     {
3019     *errorcodeptr = ERR8;
3020     goto FAILED;
3021     }
3022    
3023 nigel 77 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
3024    
3025     /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3026     matching, we have to use an XCLASS with extra data items. Caseless
3027     matching for characters > 127 is available only if UCP support is
3028     available. */
3029    
3030     #ifdef SUPPORT_UTF8
3031     if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
3032     {
3033     class_utf8 = TRUE;
3034    
3035     /* With UCP support, we can find the other case equivalents of
3036     the relevant characters. There may be several ranges. Optimize how
3037     they fit with the basic range. */
3038    
3039     #ifdef SUPPORT_UCP
3040     if ((options & PCRE_CASELESS) != 0)
3041     {
3042 nigel 93 unsigned int occ, ocd;
3043     unsigned int cc = c;
3044     unsigned int origd = d;
3045 nigel 77 while (get_othercase_range(&cc, origd, &occ, &ocd))
3046     {
3047 ph10 180 if (occ >= (unsigned int)c &&
3048     ocd <= (unsigned int)d)
3049 ph10 176 continue; /* Skip embedded ranges */
3050 nigel 77
3051 ph10 180 if (occ < (unsigned int)c &&
3052 ph10 176 ocd >= (unsigned int)c - 1) /* Extend the basic range */
3053 nigel 77 { /* if there is overlap, */
3054     c = occ; /* noting that if occ < c */
3055     continue; /* we can't have ocd > d */
3056     } /* because a subrange is */
3057 ph10 180 if (ocd > (unsigned int)d &&
3058 ph10 176 occ <= (unsigned int)d + 1) /* always shorter than */
3059 nigel 77 { /* the basic range. */
3060     d = ocd;
3061     continue;
3062     }
3063    
3064     if (occ == ocd)
3065     {
3066     *class_utf8data++ = XCL_SINGLE;
3067     }
3068     else
3069     {
3070     *class_utf8data++ = XCL_RANGE;
3071     class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3072     }
3073     class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3074     }
3075     }
3076     #endif /* SUPPORT_UCP */
3077    
3078     /* Now record the original range, possibly modified for UCP caseless
3079     overlapping ranges. */
3080    
3081     *class_utf8data++ = XCL_RANGE;
3082     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3083     class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3084    
3085     /* With UCP support, we are done. Without UCP support, there is no
3086     caseless matching for UTF-8 characters > 127; we can use the bit map
3087     for the smaller ones. */
3088    
3089     #ifdef SUPPORT_UCP
3090     continue; /* With next character in the class */
3091     #else
3092     if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3093    
3094     /* Adjust upper limit and fall through to set up the map */
3095    
3096     d = 127;
3097    
3098     #endif /* SUPPORT_UCP */
3099     }
3100     #endif /* SUPPORT_UTF8 */
3101    
3102     /* We use the bit map for all cases when not in UTF-8 mode; else
3103     ranges that lie entirely within 0-127 when there is UCP support; else
3104     for partial ranges without UCP support. */
3105    
3106 nigel 93 class_charcount += d - c + 1;
3107     class_lastchar = d;
3108    
3109     /* We can save a bit of time by skipping this in the pre-compile. */
3110    
3111     if (lengthptr == NULL) for (; c <= d; c++)
3112 nigel 77 {
3113     classbits[c/8] |= (1 << (c&7));
3114     if ((options & PCRE_CASELESS) != 0)
3115     {
3116     int uc = cd->fcc[c]; /* flip case */
3117     classbits[uc/8] |= (1 << (uc&7));
3118     }
3119     }
3120    
3121     continue; /* Go get the next char in the class */
3122     }
3123    
3124     /* Handle a lone single character - we can get here for a normal
3125     non-escape char, or after \ that introduces a single character or for an
3126     apparent range that isn't. */
3127    
3128     LONE_SINGLE_CHARACTER:
3129    
3130     /* Handle a character that cannot go in the bit map */
3131    
3132     #ifdef SUPPORT_UTF8
3133     if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3134     {
3135     class_utf8 = TRUE;
3136     *class_utf8data++ = XCL_SINGLE;
3137     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3138    
3139     #ifdef SUPPORT_UCP
3140     if ((options & PCRE_CASELESS) != 0)
3141     {
3142 nigel 93 unsigned int othercase;
3143     if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
3144 nigel 77 {
3145     *class_utf8data++ = XCL_SINGLE;
3146     class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3147     }
3148     }
3149     #endif /* SUPPORT_UCP */
3150    
3151     }
3152     else
3153     #endif /* SUPPORT_UTF8 */
3154    
3155     /* Handle a single-byte character */
3156     {
3157     classbits[c/8] |= (1 << (c&7));
3158     if ((options & PCRE_CASELESS) != 0)
3159     {
3160     c = cd->fcc[c]; /* flip case */
3161     classbits[c/8] |= (1 << (c&7));
3162     }
3163     class_charcount++;
3164     class_lastchar = c;
3165     }
3166     }
3167    
3168 nigel 93 /* Loop until ']' reached. This "while" is the end of the "do" above. */
3169 nigel 77
3170 nigel 93 while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
3171 nigel 77
3172 nigel 93 if (c == 0) /* Missing terminating ']' */
3173     {
3174     *errorcodeptr = ERR6;
3175     goto FAILED;
3176     }
3177 ph10 208
3178 nigel 77 /* If class_charcount is 1, we saw precisely one character whose value is
3179 ph10 223 less than 256. As long as there were no characters >= 128 and there was no
3180     use of \p or \P, in other words, no use of any XCLASS features, we can
3181     optimize.
3182    
3183     In UTF-8 mode, we can optimize the negative case only if there were no
3184     characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
3185     operate on single-bytes only. This is an historical hangover. Maybe one day
3186     we can tidy these opcodes to handle multi-byte characters.
3187 nigel 77
3188     The optimization throws away the bit map. We turn the item into a
3189     1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3190     that OP_NOT does not support multibyte characters. In the positive case, it
3191     can cause firstbyte to be set. Otherwise, there can be no first char if
3192     this item is first, whatever repeat count may follow. In the case of
3193     reqbyte, save the previous value for reinstating. */
3194    
3195     #ifdef SUPPORT_UTF8
3196 ph10 223 if (class_charcount == 1 && !class_utf8 &&
3197     (!utf8 || !negate_class || class_lastchar < 128))
3198 nigel 77 #else
3199     if (class_charcount == 1)
3200     #endif
3201     {
3202     zeroreqbyte = reqbyte;
3203    
3204     /* The OP_NOT opcode works on one-byte characters only. */
3205    
3206     if (negate_class)
3207     {
3208     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3209     zerofirstbyte = firstbyte;
3210     *code++ = OP_NOT;
3211     *code++ = class_lastchar;
3212     break;
3213     }
3214    
3215     /* For a single, positive character, get the value into mcbuffer, and
3216     then we can handle this with the normal one-character code. */
3217    
3218     #ifdef SUPPORT_UTF8
3219     if (utf8 && class_lastchar > 127)
3220     mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3221     else
3222     #endif
3223     {
3224     mcbuffer[0] = class_lastchar;
3225     mclength = 1;
3226     }
3227     goto ONE_CHAR;
3228     } /* End of 1-char optimization */
3229    
3230     /* The general case - not the one-char optimization. If this is the first
3231     thing in the branch, there can be no first char setting, whatever the
3232     repeat count. Any reqbyte setting must remain unchanged after any kind of
3233     repeat. */
3234    
3235     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3236     zerofirstbyte = firstbyte;
3237     zeroreqbyte = reqbyte;
3238    
3239     /* If there are characters with values > 255, we have to compile an
3240     extended class, with its own opcode. If there are no characters < 256,
3241 nigel 93 we can omit the bitmap in the actual compiled code. */
3242 nigel 77
3243     #ifdef SUPPORT_UTF8
3244     if (class_utf8)
3245     {
3246     *class_utf8data++ = XCL_END; /* Marks the end of extra data */
3247     *code++ = OP_XCLASS;
3248     code += LINK_SIZE;
3249     *code = negate_class? XCL_NOT : 0;
3250    
3251 nigel 93 /* If the map is required, move up the extra data to make room for it;
3252     otherwise just move the code pointer to the end of the extra data. */
3253 nigel 77
3254     if (class_charcount > 0)
3255     {
3256     *code++ |= XCL_MAP;
3257 nigel 93 memmove(code + 32, code, class_utf8data - code);
3258 nigel 77 memcpy(code, classbits, 32);
3259 nigel 93 code = class_utf8data + 32;
3260 nigel 77 }
3261 nigel 93 else code = class_utf8data;
3262 nigel 77
3263     /* Now fill in the complete length of the item */
3264    
3265     PUT(previous, 1, code - previous);
3266     break; /* End of class handling */
3267     }
3268     #endif
3269    
3270     /* If there are no characters > 255, negate the 32-byte map if necessary,
3271     and copy it into the code vector. If this is the first thing in the branch,
3272     there can be no first char setting, whatever the repeat count. Any reqbyte
3273     setting must remain unchanged after any kind of repeat. */
3274    
3275     if (negate_class)
3276     {
3277     *code++ = OP_NCLASS;
3278 nigel 93 if (lengthptr == NULL) /* Save time in the pre-compile phase */
3279     for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3280 nigel 77 }
3281     else
3282     {
3283     *code++ = OP_CLASS;
3284     memcpy(code, classbits, 32);
3285     }
3286     code += 32;
3287     break;
3288    
3289 nigel 93
3290     /* ===================================================================*/
3291 nigel 77 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3292     has been tested above. */
3293    
3294     case '{':
3295     if (!is_quantifier) goto NORMAL_CHAR;
3296     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3297     if (*errorcodeptr != 0) goto FAILED;
3298     goto REPEAT;
3299    
3300     case '*':
3301     repeat_min = 0;
3302     repeat_max = -1;
3303     goto REPEAT;
3304    
3305     case '+':
3306     repeat_min = 1;
3307     repeat_max = -1;
3308     goto REPEAT;
3309    
3310     case '?':
3311     repeat_min = 0;
3312     repeat_max = 1;
3313    
3314     REPEAT:
3315     if (previous == NULL)
3316     {
3317     *errorcodeptr = ERR9;
3318     goto FAILED;
3319     }
3320    
3321     if (repeat_min == 0)
3322     {
3323     firstbyte = zerofirstbyte; /* Adjust for zero repeat */
3324     reqbyte = zeroreqbyte; /* Ditto */
3325     }
3326    
3327     /* Remember whether this is a variable length repeat */
3328    
3329     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3330    
3331     op_type = 0; /* Default single-char op codes */
3332     possessive_quantifier = FALSE; /* Default not possessive quantifier */
3333    
3334     /* Save start of previous item, in case we have to move it up to make space
3335     for an inserted OP_ONCE for the additional '+' extension. */
3336    
3337     tempcode = previous;
3338    
3339     /* If the next character is '+', we have a possessive quantifier. This
3340     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3341     If the next character is '?' this is a minimizing repeat, by default,
3342     but if PCRE_UNGREEDY is set, it works the other way round. We change the
3343     repeat type to the non-default. */
3344    
3345     if (ptr[1] == '+')
3346     {
3347     repeat_type = 0; /* Force greedy */
3348     possessive_quantifier = TRUE;
3349     ptr++;
3350     }
3351     else if (ptr[1] == '?')
3352     {
3353     repeat_type = greedy_non_default;
3354     ptr++;
3355     }
3356     else repeat_type = greedy_default;
3357    
3358     /* If previous was a character match, abolish the item and generate a
3359     repeat item instead. If a char item has a minimum of more than one, ensure
3360     that it is set in reqbyte - it might not be if a sequence such as x{3} is
3361     the first thing in a branch because the x will have gone into firstbyte
3362     instead. */
3363    
3364     if (*previous == OP_CHAR || *previous == OP_CHARNC)
3365     {
3366     /* Deal with UTF-8 characters that take up more than one byte. It's
3367     easier to write this out separately than try to macrify it. Use c to
3368     hold the length of the character in bytes, plus 0x80 to flag that it's a
3369     length rather than a small character. */
3370    
3371     #ifdef SUPPORT_UTF8
3372     if (utf8 && (code[-1] & 0x80) != 0)
3373     {
3374     uschar *lastchar = code - 1;
3375     while((*lastchar & 0xc0) == 0x80) lastchar--;
3376     c = code - lastchar; /* Length of UTF-8 character */
3377     memcpy(utf8_char, lastchar, c); /* Save the char */
3378     c |= 0x80; /* Flag c as a length */
3379     }
3380     else
3381     #endif
3382    
3383     /* Handle the case of a single byte - either with no UTF8 support, or
3384     with UTF-8 disabled, or for a UTF-8 character < 128. */
3385    
3386     {
3387     c = code[-1];
3388     if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3389     }
3390    
3391 nigel 93 /* If the repetition is unlimited, it pays to see if the next thing on
3392     the line is something that cannot possibly match this character. If so,
3393     automatically possessifying this item gains some performance in the case
3394     where the match fails. */
3395    
3396     if (!possessive_quantifier &&
3397     repeat_max < 0 &&
3398     check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3399     options, cd))
3400     {
3401     repeat_type = 0; /* Force greedy */
3402     possessive_quantifier = TRUE;
3403     }
3404    
3405 nigel 77 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3406     }
3407    
3408     /* If previous was a single negated character ([^a] or similar), we use
3409     one of the special opcodes, replacing it. The code is shared with single-
3410     character repeats by setting op_type to add a suitable offset into
3411 nigel 93 repeat_type. We can also test for auto-possessification. OP_NOT is
3412     currently used only for single-byte chars. */
3413 nigel 77
3414     else if (*previous == OP_NOT)
3415     {
3416     op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3417     c = previous[1];
3418 nigel 93 if (!possessive_quantifier &&
3419     repeat_max < 0 &&
3420     check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3421     {
3422     repeat_type = 0; /* Force greedy */
3423     possessive_quantifier = TRUE;
3424     }
3425 nigel 77 goto OUTPUT_SINGLE_REPEAT;
3426     }
3427    
3428     /* If previous was a character type match (\d or similar), abolish it and
3429     create a suitable repeat item. The code is shared with single-character
3430     repeats by setting op_type to add a suitable offset into repeat_type. Note
3431     that the Unicode property types will be present only when SUPPORT_UCP is
3432     defined, but we don't wrap the little bits of code here because it just
3433     makes it horribly messy. */
3434    
3435     else if (*previous < OP_EODN)
3436     {
3437     uschar *oldcode;
3438 nigel 87 int prop_type, prop_value;
3439 nigel 77 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3440     c = *previous;
3441    
3442 nigel 93 if (!possessive_quantifier &&
3443     repeat_max < 0 &&
3444     check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3445     {
3446     repeat_type = 0; /* Force greedy */
3447     possessive_quantifier = TRUE;
3448     }
3449    
3450 nigel 77 OUTPUT_SINGLE_REPEAT:
3451 nigel 87 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3452     {
3453     prop_type = previous[1];
3454     prop_value = previous[2];
3455     }
3456     else prop_type = prop_value = -1;
3457 nigel 77
3458     oldcode = code;
3459     code = previous; /* Usually overwrite previous item */
3460    
3461     /* If the maximum is zero then the minimum must also be zero; Perl allows
3462     this case, so we do too - by simply omitting the item altogether. */
3463    
3464     if (repeat_max == 0) goto END_REPEAT;
3465    
3466     /* All real repeats make it impossible to handle partial matching (maybe
3467     one day we will be able to remove this restriction). */
3468    
3469     if (repeat_max != 1) cd->nopartial = TRUE;
3470    
3471     /* Combine the op_type with the repeat_type */
3472    
3473     repeat_type += op_type;
3474    
3475     /* A minimum of zero is handled either as the special case * or ?, or as
3476     an UPTO, with the maximum given. */
3477    
3478     if (repeat_min == 0)
3479     {
3480     if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3481     else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3482     else
3483     {
3484     *code++ = OP_UPTO + repeat_type;
3485     PUT2INC(code, 0, repeat_max);
3486     }
3487     }
3488    
3489     /* A repeat minimum of 1 is optimized into some special cases. If the
3490 nigel 93 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3491 nigel 77 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3492     one less than the maximum. */
3493    
3494     else if (repeat_min == 1)
3495     {
3496     if (repeat_max == -1)
3497     *code++ = OP_PLUS + repeat_type;
3498     else
3499     {
3500     code = oldcode; /* leave previous item in place */
3501     if (repeat_max == 1) goto END_REPEAT;
3502     *code++ = OP_UPTO + repeat_type;
3503     PUT2INC(code, 0, repeat_max - 1);
3504     }
3505     }
3506    
3507     /* The case {n,n} is just an EXACT, while the general case {n,m} is
3508     handled as an EXACT followed by an UPTO. */
3509    
3510     else
3511     {
3512     *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3513     PUT2INC(code, 0, repeat_min);
3514    
3515     /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3516     we have to insert the character for the previous code. For a repeated
3517 nigel 87 Unicode property match, there are two extra bytes that define the
3518 nigel 77 required property. In UTF-8 mode, long characters have their length in
3519     c, with the 0x80 bit as a flag. */
3520    
3521     if (repeat_max < 0)
3522     {
3523     #ifdef SUPPORT_UTF8
3524     if (utf8 && c >= 128)
3525     {
3526     memcpy(code, utf8_char, c & 7);
3527     code += c & 7;
3528     }
3529     else
3530     #endif
3531     {
3532     *code++ = c;
3533 nigel 87 if (prop_type >= 0)
3534     {
3535     *code++ = prop_type;
3536     *code++ = prop_value;
3537     }
3538 nigel 77 }
3539     *code++ = OP_STAR + repeat_type;
3540     }
3541    
3542     /* Else insert an UPTO if the max is greater than the min, again
3543 nigel 93 preceded by the character, for the previously inserted code. If the
3544     UPTO is just for 1 instance, we can use QUERY instead. */
3545 nigel 77
3546     else if (repeat_max != repeat_min)
3547     {
3548     #ifdef SUPPORT_UTF8
3549     if (utf8 && c >= 128)
3550     {
3551     memcpy(code, utf8_char, c & 7);
3552     code += c & 7;
3553     }
3554     else
3555     #endif
3556     *code++ = c;
3557 nigel 87 if (prop_type >= 0)
3558     {
3559     *code++ = prop_type;
3560     *code++ = prop_value;
3561     }
3562 nigel 77 repeat_max -= repeat_min;
3563 nigel 93
3564     if (repeat_max == 1)
3565     {
3566     *code++ = OP_QUERY + repeat_type;
3567     }
3568     else
3569     {
3570     *code++ = OP_UPTO + repeat_type;
3571     PUT2INC(code, 0, repeat_max);
3572     }
3573 nigel 77 }
3574     }
3575    
3576     /* The character or character type itself comes last in all cases. */
3577    
3578     #ifdef SUPPORT_UTF8
3579     if (utf8 && c >= 128)
3580     {
3581     memcpy(code, utf8_char, c & 7);
3582     code += c & 7;
3583     }
3584     else
3585     #endif
3586     *code++ = c;
3587    
3588 nigel 87 /* For a repeated Unicode property match, there are two extra bytes that
3589     define the required property. */
3590 nigel 77
3591     #ifdef SUPPORT_UCP
3592 nigel 87 if (prop_type >= 0)
3593     {
3594     *code++ = prop_type;
3595     *code++ = prop_value;
3596     }
3597 nigel 77 #endif
3598     }
3599    
3600     /* If previous was a character class or a back reference, we put the repeat
3601     stuff after it, but just skip the item if the repeat was {0,0}. */
3602    
3603     else if (*previous == OP_CLASS ||
3604     *previous == OP_NCLASS ||
3605     #ifdef SUPPORT_UTF8
3606     *previous == OP_XCLASS ||
3607     #endif
3608     *previous == OP_REF)
3609     {
3610     if (repeat_max == 0)
3611     {
3612     code = previous;
3613     goto END_REPEAT;
3614     }
3615    
3616     /* All real repeats make it impossible to handle partial matching (maybe
3617     one day we will be able to remove this restriction). */
3618    
3619     if (repeat_max != 1) cd->nopartial = TRUE;
3620    
3621     if (repeat_min == 0 && repeat_max == -1)
3622     *code++ = OP_CRSTAR + repeat_type;
3623     else if (repeat_min == 1 && repeat_max == -1)
3624     *code++ = OP_CRPLUS + repeat_type;
3625     else if (repeat_min == 0 && repeat_max == 1)
3626     *code++ = OP_CRQUERY + repeat_type;
3627     else
3628     {
3629     *code++ = OP_CRRANGE + repeat_type;
3630     PUT2INC(code, 0, repeat_min);
3631     if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
3632     PUT2INC(code, 0, repeat_max);
3633     }
3634     }
3635    
3636     /* If previous was a bracket group, we may have to replicate it in certain
3637     cases. */
3638    
3639 nigel 93 else if (*previous == OP_BRA || *previous == OP_CBRA ||
3640     *previous == OP_ONCE || *previous == OP_COND)
3641 nigel 77 {
3642     register int i;
3643     int ketoffset = 0;
3644     int len = code - previous;
3645     uschar *bralink = NULL;
3646    
3647 nigel 93 /* Repeating a DEFINE group is pointless */
3648    
3649     if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3650     {
3651     *errorcodeptr = ERR55;
3652     goto FAILED;
3653     }
3654    
3655 nigel 77 /* If the maximum repeat count is unlimited, find the end of the bracket
3656     by scanning through from the start, and compute the offset back to it
3657     from the current code pointer. There may be an OP_OPT setting following
3658     the final KET, so we can't find the end just by going back from the code
3659     pointer. */
3660    
3661     if (repeat_max == -1)
3662     {
3663     register uschar *ket = previous;
3664     do ket += GET(ket, 1); while (*ket != OP_KET);
3665     ketoffset = code - ket;
3666     }
3667    
3668     /* The case of a zero minimum is special because of the need to stick
3669     OP_BRAZERO in front of it, and because the group appears once in the
3670     data, whereas in other cases it appears the minimum number of times. For
3671     this reason, it is simplest to treat this case separately, as otherwise
3672     the code gets far too messy. There are several special subcases when the
3673     minimum is zero. */
3674    
3675     if (repeat_min == 0)
3676     {
3677     /* If the maximum is also zero, we just omit the group from the output
3678     altogether. */
3679    
3680     if (repeat_max == 0)
3681     {
3682     code = previous;
3683     goto END_REPEAT;
3684     }
3685    
3686     /* If the maximum is 1 or unlimited, we just have to stick in the
3687     BRAZERO and do no more at this point. However, we do need to adjust
3688     any OP_RECURSE calls inside the group that refer to the group itself or
3689 nigel 93 any internal or forward referenced group, because the offset is from
3690     the start of the whole regex. Temporarily terminate the pattern while
3691     doing this. */
3692 nigel 77
3693     if (repeat_max <= 1)
3694     {
3695     *code = OP_END;
3696 nigel 93 adjust_recurse(previous, 1, utf8, cd, save_hwm);
3697 nigel 77 memmove(previous+1, previous, len);
3698     code++;
3699     *previous++ = OP_BRAZERO + repeat_type;
3700     }
3701    
3702     /* If the maximum is greater than 1 and limited, we have to replicate
3703     in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3704     The first one has to be handled carefully because it's the original
3705     copy, which has to be moved up. The remainder can be handled by code
3706     that is common with the non-zero minimum case below. We have to
3707     adjust the value of repeat_max, since one less copy is required. Once
3708     again, we may have to adjust any OP_RECURSE calls inside the group. */
3709    
3710     else
3711     {
3712     int offset;
3713     *code = OP_END;
3714 nigel 93 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3715 nigel 77 memmove(previous + 2 + LINK_SIZE, previous, len);
3716     code += 2 + LINK_SIZE;
3717     *previous++ = OP_BRAZERO + repeat_type;
3718     *previous++ = OP_BRA;
3719    
3720     /* We chain together the bracket offset fields that have to be
3721     filled in later when the ends of the brackets are reached. */
3722    
3723     offset = (bralink == NULL)? 0 : previous - bralink;
3724     bralink = previous;
3725     PUTINC(previous, 0, offset);
3726     }
3727    
3728     repeat_max--;
3729     }
3730    
3731     /* If the minimum is greater than zero, replicate the group as many
3732     times as necessary, and adjust the maximum to the number of subsequent
3733     copies that we need. If we set a first char from the group, and didn't
3734 nigel 93 set a required char, copy the latter from the former. If there are any
3735     forward reference subroutine calls in the group, there will be entries on
3736     the workspace list; replicate these with an appropriate increment. */
3737 nigel 77
3738     else
3739     {
3740     if (repeat_min > 1)
3741     {
3742 nigel 93 /* In the pre-compile phase, we don't actually do the replication. We
3743 ph10 202 just adjust the length as if we had. Do some paranoid checks for
3744     potential integer overflow. */
3745 nigel 93
3746     if (lengthptr != NULL)
3747 ph10 202 {
3748     int delta = (repeat_min - 1)*length_prevgroup;
3749     if ((double)(repeat_min - 1)*(double)length_prevgroup >
3750     (double)INT_MAX ||
3751     OFLOW_MAX - *lengthptr < delta)
3752     {
3753     *errorcodeptr = ERR20;
3754     goto FAILED;
3755     }
3756     *lengthptr += delta;
3757     }
3758 nigel 93
3759     /* This is compiling for real */
3760    
3761     else
3762 nigel 77 {
3763 nigel 93 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3764     for (i = 1; i < repeat_min; i++)
3765     {
3766     uschar *hc;
3767     uschar *this_hwm = cd->hwm;
3768     memcpy(code, previous, len);
3769     for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3770     {
3771     PUT(cd->hwm, 0, GET(hc, 0) + len);
3772     cd->hwm += LINK_SIZE;
3773     }
3774     save_hwm = this_hwm;
3775     code += len;
3776     }
3777 nigel 77 }
3778     }
3779 nigel 93
3780 nigel 77 if (repeat_max > 0) repeat_max -= repeat_min;
3781     }
3782    
3783     /* This code is common to both the zero and non-zero minimum cases. If
3784     the maximum is limited, it replicates the group in a nested fashion,
3785     remembering the bracket starts on a stack. In the case of a zero minimum,
3786     the first one was set up above. In all cases the repeat_max now specifies
3787 nigel 93 the number of additional copies needed. Again, we must remember to
3788     replicate entries on the forward reference list. */
3789 nigel 77
3790     if (repeat_max >= 0)
3791     {
3792 nigel 93 /* In the pre-compile phase, we don't actually do the replication. We
3793     just adjust the length as if we had. For each repetition we must add 1
3794     to the length for BRAZERO and for all but the last repetition we must
3795 ph10 202 add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
3796     paranoid checks to avoid integer overflow. */
3797 nigel 93
3798     if (lengthptr != NULL && repeat_max > 0)
3799 ph10 202 {
3800     int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3801     2 - 2*LINK_SIZE; /* Last one doesn't nest */
3802     if ((double)repeat_max *
3803     (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
3804     > (double)INT_MAX ||
3805     OFLOW_MAX - *lengthptr < delta)
3806     {
3807     *errorcodeptr = ERR20;
3808     goto FAILED;
3809     }
3810     *lengthptr += delta;
3811     }
3812 nigel 93
3813     /* This is compiling for real */
3814    
3815     else for (i = repeat_max - 1; i >= 0; i--)
3816 nigel 77 {
3817 nigel 93 uschar *hc;
3818     uschar *this_hwm = cd->hwm;
3819    
3820 nigel 77 *code++ = OP_BRAZERO + repeat_type;
3821    
3822     /* All but the final copy start a new nesting, maintaining the
3823     chain of brackets outstanding. */
3824    
3825     if (i != 0)
3826     {
3827     int offset;
3828     *code++ = OP_BRA;
3829     offset = (bralink == NULL)? 0 : code - bralink;
3830     bralink = code;
3831     PUTINC(code, 0, offset);
3832     }
3833    
3834     memcpy(code, previous, len);
3835 nigel 93 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3836     {
3837     PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3838     cd->hwm += LINK_SIZE;
3839     }
3840     save_hwm = this_hwm;
3841 nigel 77 code += len;
3842     }
3843    
3844     /* Now chain through the pending brackets, and fill in their length
3845     fields (which are holding the chain links pro tem). */
3846    
3847     while (bralink != NULL)
3848     {
3849     int oldlinkoffset;
3850     int offset = code - bralink + 1;
3851     uschar *bra = code - offset;
3852     oldlinkoffset = GET(bra, 1);
3853     bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
3854     *code++ = OP_KET;
3855     PUTINC(code, 0, offset);
3856     PUT(bra, 1, offset);
3857     }
3858     }
3859    
3860     /* If the maximum is unlimited, set a repeater in the final copy. We
3861     can't just offset backwards from the current code point, because we
3862     don't know if there's been an options resetting after the ket. The
3863 nigel 93 correct offset was computed above.
3864 nigel 77
3865 nigel 93 Then, when we are doing the actual compile phase, check to see whether
3866     this group is a non-atomic one that could match an empty string. If so,
3867     convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
3868     that runtime checking can be done. [This check is also applied to
3869     atomic groups at runtime, but in a different way.] */
3870    
3871     else
3872     {
3873     uschar *ketcode = code - ketoffset;
3874     uschar *bracode = ketcode - GET(ketcode, 1);
3875     *ketcode = OP_KETRMAX + repeat_type;
3876     if (lengthptr == NULL && *bracode != OP_ONCE)
3877     {
3878     uschar *scode = bracode;
3879     do
3880     {
3881     if (could_be_empty_branch(scode, ketcode, utf8))
3882     {
3883     *bracode += OP_SBRA - OP_BRA;
3884     break;
3885     }
3886     scode += GET(scode, 1);
3887     }
3888     while (*scode == OP_ALT);
3889     }
3890     }
3891 nigel 77 }
3892    
3893     /* Else there's some kind of shambles */
3894    
3895     else
3896     {
3897     *errorcodeptr = ERR11;
3898     goto FAILED;
3899     }
3900    
3901 nigel 93 /* If the character following a repeat is '+', or if certain optimization
3902     tests above succeeded, possessive_quantifier is TRUE. For some of the
3903     simpler opcodes, there is a special alternative opcode for this. For
3904     anything else, we wrap the entire repeated item inside OP_ONCE brackets.
3905     The '+' notation is just syntactic sugar, taken from Sun's Java package,
3906     but the special opcodes can optimize it a bit. The repeated item starts at
3907     tempcode, not at previous, which might be the first part of a string whose
3908     (former) last char we repeated.
3909 nigel 77
3910 nigel 93 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
3911     an 'upto' may follow. We skip over an 'exact' item, and then test the
3912     length of what remains before proceeding. */
3913    
3914 nigel 77 if (possessive_quantifier)
3915     {
3916 nigel 93 int len;
3917     if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
3918     *tempcode == OP_NOTEXACT)
3919     tempcode += _pcre_OP_lengths[*tempcode];
3920     len = code - tempcode;
3921     if (len > 0) switch (*tempcode)
3922     {
3923     case OP_STAR: *tempcode = OP_POSSTAR; break;
3924     case OP_PLUS: *tempcode = OP_POSPLUS; break;
3925     case OP_QUERY: *tempcode = OP_POSQUERY; break;
3926     case OP_UPTO: *tempcode = OP_POSUPTO; break;
3927    
3928     case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
3929     case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
3930     case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
3931     case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
3932    
3933     case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
3934     case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
3935     case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
3936     case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
3937    
3938     default:
3939     memmove(tempcode + 1+LINK_SIZE, tempcode, len);
3940     code += 1 + LINK_SIZE;
3941     len += 1 + LINK_SIZE;
3942     tempcode[0] = OP_ONCE;
3943     *code++ = OP_KET;
3944     PUTINC(code, 0, len);
3945     PUT(tempcode, 1, len);
3946     break;
3947     }
3948 nigel 77 }
3949    
3950     /* In all cases we no longer have a previous item. We also set the
3951     "follows varying string" flag for subsequently encountered reqbytes if
3952     it isn't already set and we have just passed a varying length item. */
3953    
3954     END_REPEAT:
3955     previous = NULL;
3956     cd->req_varyopt |= reqvary;
3957     break;
3958    
3959    
3960 nigel 93 /* ===================================================================*/
3961     /* Start of nested parenthesized sub-expression, or comment or lookahead or
3962     lookbehind or option setting or condition or all the other extended
3963 ph10 210 parenthesis forms. */
3964 nigel 77
3965     case '(':
3966     newoptions = options;
3967     skipbytes = 0;
3968 nigel 93 bravalue = OP_CBRA;
3969     save_hwm = cd->hwm;
3970 ph10 180 reset_bracount = FALSE;
3971 ph10 211
3972 ph10 210 /* First deal with various "verbs" that can be introduced by '*'. */
3973 ph10 211
3974 ph10 210 if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
3975     {
3976 ph10 211 int i, namelen;
3977 ph10 210 const uschar *name = ++ptr;
3978     previous = NULL;
3979     while ((cd->ctypes[*++ptr] & ctype_letter) != 0);
3980     if (*ptr == ':')
3981     {
3982     *errorcodeptr = ERR59; /* Not supported */
3983 ph10 211 goto FAILED;
3984     }
3985 ph10 210 if (*ptr != ')')
3986     {
3987     *errorcodeptr = ERR60;
3988     goto FAILED;
3989     }
3990 ph10 211 namelen = ptr - name;
3991 ph10 210 for (i = 0; i < verbcount; i++)
3992 ph10 211 {
3993 ph10 210 if (namelen == verbs[i].len &&
3994     strncmp((char *)name, verbs[i].name, namelen) == 0)
3995     {
3996     *code = verbs[i].op;
3997     if (*code++ == OP_ACCEPT) cd->had_accept = TRUE;
3998     break;
3999 ph10 211 }
4000     }
4001     if (i < verbcount) continue;
4002 ph10 210 *errorcodeptr = ERR60;
4003 ph10 211 goto FAILED;
4004     }
4005    
4006 ph10 210 /* Deal with the extended parentheses; all are introduced by '?', and the
4007     appearance of any of them means that this is not a capturing group. */
4008 nigel 77
4009 ph10 210 else if (*ptr == '?')
4010 nigel 77 {
4011 nigel 93 int i, set, unset, namelen;
4012 nigel 77 int *optset;
4013 nigel 93 const uschar *name;
4014     uschar *slot;
4015 nigel 77
4016     switch (*(++ptr))
4017     {
4018     case '#': /* Comment; skip to ket */
4019     ptr++;
4020 nigel 93 while (*ptr != 0 && *ptr != ')') ptr++;
4021     if (*ptr == 0)
4022     {
4023     *errorcodeptr = ERR18;
4024     goto FAILED;
4025     }
4026 nigel 77 continue;
4027    
4028 nigel 93
4029     /* ------------------------------------------------------------ */
4030 ph10 175 case '|': /* Reset capture count for each branch */
4031     reset_bracount = TRUE;
4032 ph10 180 /* Fall through */
4033 ph10 175
4034     /* ------------------------------------------------------------ */
4035 nigel 93 case ':': /* Non-capturing bracket */
4036 nigel 77 bravalue = OP_BRA;
4037     ptr++;
4038     break;
4039    
4040 nigel 93
4041     /* ------------------------------------------------------------ */
4042 nigel 77 case '(':
4043     bravalue = OP_COND; /* Conditional group */
4044    
4045 nigel 93 /* A condition can be an assertion, a number (referring to a numbered
4046     group), a name (referring to a named group), or 'R', referring to
4047     recursion. R<digits> and R&name are also permitted for recursion tests.
4048 nigel 77
4049 nigel 93 There are several syntaxes for testing a named group: (?(name)) is used
4050     by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
4051    
4052     There are two unfortunate ambiguities, caused by history. (a) 'R' can
4053     be the recursive thing or the name 'R' (and similarly for 'R' followed
4054     by digits), and (b) a number could be a name that consists of digits.
4055     In both cases, we look for a name first; if not found, we try the other
4056     cases. */
4057    
4058     /* For conditions that are assertions, check the syntax, and then exit
4059     the switch. This will take control down to where bracketed groups,
4060     including assertions, are processed. */
4061    
4062     if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
4063     break;
4064    
4065     /* Most other conditions use OP_CREF (a couple change to OP_RREF
4066     below), and all need to skip 3 bytes at the start of the group. */
4067    
4068     code[1+LINK_SIZE] = OP_CREF;
4069     skipbytes = 3;
4070 ph10 172 refsign = -1;
4071 nigel 93
4072     /* Check for a test for recursion in a named group. */
4073    
4074     if (ptr[1] == 'R' && ptr[2] == '&')
4075 nigel 77 {
4076 nigel 93 terminator = -1;
4077     ptr += 2;
4078     code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
4079     }
4080 nigel 91
4081 nigel 93 /* Check for a test for a named group's having been set, using the Perl
4082     syntax (?(<name>) or (?('name') */
4083 nigel 91
4084 nigel 93 else if (ptr[1] == '<')
4085     {
4086     terminator = '>';
4087     ptr++;
4088     }
4089     else if (ptr[1] == '\'')
4090     {
4091     terminator = '\'';
4092     ptr++;
4093     }
4094 ph10 172 else
4095 ph10 167 {
4096     terminator = 0;
4097 ph10 172 if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
4098     }
4099 nigel 77
4100 nigel 93 /* We now expect to read a name; anything else is an error */
4101 nigel 77
4102 nigel 93 if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
4103     {
4104     ptr += 1; /* To get the right offset */
4105     *errorcodeptr = ERR28;
4106     goto FAILED;
4107     }
4108    
4109     /* Read the name, but also get it as a number if it's all digits */
4110    
4111     recno = 0;
4112     name = ++ptr;
4113     while ((cd->ctypes[*ptr] & ctype_word) != 0)
4114     {
4115     if (recno >= 0)
4116     recno = ((digitab[*ptr] & ctype_digit) != 0)?
4117     recno * 10 + *ptr - '0' : -1;
4118 nigel 91 ptr++;
4119 nigel 93 }
4120     namelen = ptr - name;
4121 nigel 91
4122 nigel 93 if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
4123