/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 205 - (hide annotations) (download)
Fri Aug 3 13:18:33 2007 UTC (7 years, 2 months ago) by ph10
File MIME type: text/plain
File size: 185584 byte(s)
Fix bugs with [\E] and [\Q\E].

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 117 Copyright (c) 1997-2007 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains the external function pcre_compile(), along with
42     supporting internal functions that are not used by other modules. */
43    
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 199 #include <config.h>
47 ph10 200 #endif
48 ph10 199
49 nigel 93 #define NLBLOCK cd /* Block containing newline information */
50     #define PSSTART start_pattern /* Field containing processed string start */
51     #define PSEND end_pattern /* Field containing processed string end */
52    
53 nigel 77 #include "pcre_internal.h"
54    
55    
56 nigel 85 /* When DEBUG is defined, we need the pcre_printint() function, which is also
57     used by pcretest. DEBUG is not defined when building a production library. */
58    
59     #ifdef DEBUG
60     #include "pcre_printint.src"
61     #endif
62    
63    
/* Macro for setting individual bits in class bitmaps: sets bit number b in the
byte array a, eight bits per byte, least significant bit first. Both arguments
and the whole expansion are parenthesized so that an expression argument such
as SETBIT(map, base + n) addresses the intended byte and bit. Note that b is
evaluated twice, so it must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
67    
68 ph10 202 /* Maximum length value to check against when making sure that the integer that
69     holds the compiled pattern length does not overflow. We make it a bit less than
70     INT_MAX to allow for adding in group terminating bytes, so that we don't have
71     to check them every time. */
72 ph10 178
73 ph10 202 #define OFLOW_MAX (INT_MAX - 20)
74    
75    
76 nigel 77 /*************************************************
77     * Code parameters and static tables *
78     *************************************************/
79    
80 nigel 93 /* This value specifies the size of stack workspace that is used during the
81     first pre-compile phase that determines how much memory is required. The regex
82     is partly compiled into this space, but the compiled parts are discarded as
83     soon as they can be, so that hopefully there will never be an overrun. The code
84     does, however, check for an overrun. The largest amount I've seen used is 218,
85     so this number is very generous.
86 nigel 77
87 nigel 93 The same workspace is used during the second, actual compile phase for
88     remembering forward references to groups so that they can be filled in at the
89     end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
90     is 4 there is plenty of room. */
91 nigel 77
92 nigel 93 #define COMPILE_WORK_SIZE (4096)
93 nigel 77
94 nigel 93
95 nigel 77 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
96     are simple data values; negative values are for special things like \d and so
97     on. Zero means further processing is needed (for things like \x), or the escape
98     is invalid. */
99    
/* check_escape() indexes this table with (c - '0') in the ASCII build, which
covers the 75 characters '0' to 'z', and with (c - 0x48) in the EBCDIC build,
which covers code points 0x48 to 0xFF. A non-zero entry replaces the escaped
character directly: positive entries are literal data values (for example 7 for
\a), negative entries are -ESC_xxx codes. A zero entry sends the character on
to the switch in check_escape() for further processing. */
100 ph10 97 #ifndef EBCDIC /* This is the "normal" table for ASCII systems */
101 nigel 77 static const short int escapes[] = {
102     0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
103     0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
104     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
105 ph10 178 -ESC_H, 0, 0, -ESC_K, 0, 0, 0, 0, /* H - O */
106     -ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, -ESC_V, -ESC_W, /* P - W */
107 nigel 77 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
108     '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
109 ph10 178 -ESC_h, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */
110     -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, -ESC_v, -ESC_w, /* p - w */
111 nigel 77 0, 0, -ESC_z /* x - z */
112     };
113    
114 ph10 97 #else /* This is the "abnormal" table for EBCDIC systems */
115 nigel 77 static const short int escapes[] = {
116     /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
117     /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
118     /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
119     /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
120     /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
121     /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
122     /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
123     /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
124 ph10 178 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
125 nigel 93 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
126 nigel 77 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
127 ph10 178 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
128 nigel 77 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
129     /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
130     /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
131     /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
132 ph10 178 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
133 ph10 195 /* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P,
134 nigel 93 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
135 ph10 178 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
136 nigel 77 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
137     /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
138     /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
139     };
140     #endif
141    
142    
143     /* Tables of names of POSIX character classes and their lengths. The list is
144 nigel 87 terminated by a zero length entry. The first three must be alpha, lower, upper,
145 nigel 77 as this is assumed for handling case independence. */
146    
/* posix_names holds the 14 class names. posix_name_lengths is the parallel
array of their lengths, with one extra trailing 0 entry that acts as the list
terminator, so it has 15 entries. The two arrays must be kept in step. */
147     static const char *const posix_names[] = {
148     "alpha", "lower", "upper",
149     "alnum", "ascii", "blank", "cntrl", "digit", "graph",
150     "print", "punct", "space", "word", "xdigit" };
151    
152     static const uschar posix_name_lengths[] = {
153     5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
154    
155 nigel 87 /* Table of class bit maps for each POSIX class. Each class is formed from a
156     base map, with an optional addition or removal of another map. Then, for some
157     classes, there is some additional tweaking: for [:blank:] the vertical space
158     characters are removed, and for [:alpha:] and [:alnum:] the underscore
159     character is removed. The triples in the table consist of the base map offset,
160     second map offset or -1 if no second map, and a non-negative value for map
161     addition or a negative value for map subtraction (if there are two maps). The
162     absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
163     remove vertical space characters, 2 => remove underscore. */
164 nigel 77
/* One triple of ints per class, listed in the same order as posix_names (see
the per-row comments), giving 14 triples in all. */
165     static const int posix_class_maps[] = {
166 nigel 87 cbit_word, cbit_digit, -2, /* alpha */
167     cbit_lower, -1, 0, /* lower */
168     cbit_upper, -1, 0, /* upper */
169     cbit_word, -1, 2, /* alnum - word without underscore */
170     cbit_print, cbit_cntrl, 0, /* ascii */
171     cbit_space, -1, 1, /* blank - a GNU extension */
172     cbit_cntrl, -1, 0, /* cntrl */
173     cbit_digit, -1, 0, /* digit */
174     cbit_graph, -1, 0, /* graph */
175     cbit_print, -1, 0, /* print */
176     cbit_punct, -1, 0, /* punct */
177     cbit_space, -1, 0, /* space */
178     cbit_word, -1, 0, /* word - a Perl extension */
179     cbit_xdigit,-1, 0 /* xdigit */
180 nigel 77 };
181    
182    
/* Two-level stringification: XSTRING(s) macro-expands s first and then hands
the result to STRING, which turns it into a string literal. This is what lets
the error texts below embed the values of MAX_NAME_SIZE and MAX_NAME_COUNT
rather than their names. */
183 nigel 93 #define STRING(a) # a
184     #define XSTRING(s) STRING(s)
185    
186 nigel 77 /* The texts of compile-time error messages. These are "char *" because they
187 nigel 93 are passed to the outside world. Do not ever re-use any error number, because
188     they are documented. Always add a new error instead. Messages marked DEAD below
189     are no longer used. */
190 nigel 77
/* The position of a message in this array is its error number; the numeric
marker comments show the index of the message that follows each of them.
Because positions are significant, new messages must only be appended at the
end, and DEAD entries must stay in place as placeholders. */
191     static const char *error_texts[] = {
192     "no error",
193     "\\ at end of pattern",
194     "\\c at end of pattern",
195     "unrecognized character follows \\",
196     "numbers out of order in {} quantifier",
197     /* 5 */
198     "number too big in {} quantifier",
199     "missing terminating ] for character class",
200     "invalid escape sequence in character class",
201     "range out of order in character class",
202     "nothing to repeat",
203     /* 10 */
204 nigel 93 "operand of unlimited repeat could match the empty string", /** DEAD **/
205 nigel 77 "internal error: unexpected repeat",
206     "unrecognized character after (?",
207     "POSIX named classes are supported only within a class",
208     "missing )",
209     /* 15 */
210     "reference to non-existent subpattern",
211     "erroffset passed as NULL",
212     "unknown option bit(s) set",
213     "missing ) after comment",
214 nigel 93 "parentheses nested too deeply", /** DEAD **/
215 nigel 77 /* 20 */
216 ph10 202 "regular expression is too large",
217 nigel 77 "failed to get memory",
218     "unmatched parentheses",
219     "internal error: code overflow",
220     "unrecognized character after (?<",
221     /* 25 */
222     "lookbehind assertion is not fixed length",
223 nigel 91 "malformed number or name after (?(",
224 nigel 77 "conditional group contains more than two branches",
225     "assertion expected after (?(",
226 ph10 166 "(?R or (?[+-]digits must be followed by )",
227 nigel 77 /* 30 */
228     "unknown POSIX class name",
229     "POSIX collating elements are not supported",
230     "this version of PCRE is not compiled with PCRE_UTF8 support",
231 nigel 93 "spare error", /** DEAD **/
232 nigel 77 "character value in \\x{...} sequence is too large",
233     /* 35 */
234     "invalid condition (?(0)",
235     "\\C not allowed in lookbehind assertion",
236     "PCRE does not support \\L, \\l, \\N, \\U, or \\u",
237     "number after (?C is > 255",
238     "closing ) for (?C expected",
239     /* 40 */
240     "recursive call could loop indefinitely",
241     "unrecognized character after (?P",
242 nigel 93 "syntax error in subpattern name (missing terminator)",
243 nigel 91 "two named subpatterns have the same name",
244 nigel 77 "invalid UTF-8 string",
245     /* 45 */
246     "support for \\P, \\p, and \\X has not been compiled",
247     "malformed \\P or \\p sequence",
248 nigel 91 "unknown property name after \\P or \\p",
249 nigel 93 "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)",
250     "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")",
251 nigel 91 /* 50 */
252 ph10 202 "repeated subpattern is too long", /** DEAD **/
253 nigel 93 "octal value is greater than \\377 (not in UTF-8 mode)",
254     "internal error: overran compiling workspace",
255     "internal error: previously-checked referenced subpattern not found",
256     "DEFINE group contains more than one branch",
257     /* 55 */
258     "repeating a DEFINE group is not allowed",
259     "inconsistent NEWLINE options",
260 ph10 171 "\\g is not followed by a braced name or an optionally braced non-zero number",
261 ph10 172 "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number"
262 nigel 77 };
263    
264    
265     /* Table to identify digits and hex digits. This is used when compiling
266     patterns. Note that the tables in chartables are dependent on the locale, and
267     may mark arbitrary characters as digits - but the PCRE compiling code expects
268     to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
269     a private table here. It costs 256 bytes, but it is a lot faster than doing
270     character value tests (at least in some simple cases I timed), and in some
271     applications one wants PCRE to compile efficiently as well as match
272     efficiently.
273    
274     For convenience, we use the same bit definitions as in chartables:
275    
276     0x04 decimal digit
277     0x08 hexadecimal digit
278    
279     Then we can use ctype_digit and ctype_xdigit in the code. */
280    
/* digitab has 256 entries, one per byte value. In both variants below, only
the ten decimal digits carry 0x0c (decimal digit and hex digit), and only the
letters a-f and A-F carry 0x08 (hex digit only); every other entry is zero.
The EBCDIC build additionally defines ebcdic_chartab, whose entries
check_escape() tests with the mask 0x0E to decide whether a character is
alphameric. NOTE(review): the individual bit meanings in ebcdic_chartab are
presumably the same ctype_xxx bits used by the main character tables - confirm
against pcre_internal.h. */
281 ph10 97 #ifndef EBCDIC /* This is the "normal" case, for ASCII systems */
282 nigel 77 static const unsigned char digitab[] =
283     {
284     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
285     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
286     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
287     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
288     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
289     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
290     0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
291     0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
292     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
293     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
294     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
295     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
296     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
297     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
298     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
299     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
300     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
301     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
302     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
303     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
304     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
305     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
306     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
307     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
308     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
309     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
310     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
311     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
312     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
313     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
314     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
315     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
316    
317 ph10 97 #else /* This is the "abnormal" case, for EBCDIC systems */
318 nigel 77 static const unsigned char digitab[] =
319     {
320     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
321     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
322     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
323     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
324     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
325     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
326     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
327     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
328     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
329     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
330     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
331 ph10 97 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
332 nigel 77 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
333     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
334     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
335     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
336     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
337     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
338     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
339     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
340     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
341     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
342     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
343     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
344     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
345     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
346     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
347     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
348     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
349     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
350     0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
351     0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
352    
353     static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
354     0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
355     0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
356     0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
357     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
358     0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
359     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
360     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
361     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
362     0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
363     0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
364     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
365 ph10 97 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
366 nigel 77 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
367     0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
368     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
369     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
370     0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
371     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
372     0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
373     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
374     0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
375     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
376     0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
377     0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
378     0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
379     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
380     0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
381     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
382     0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
383     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
384     0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
385     0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
386     #endif
387    
388    
389     /* Forward declaration (prototype only) to allow mutual recursion; the
function body is defined later in this file. */
390    
391     static BOOL
392 ph10 180 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
393 ph10 175 int *, int *, branch_chain *, compile_data *, int *);
394 nigel 77
395    
396    
397     /*************************************************
398     * Handle escapes *
399     *************************************************/
400    
401     /* This function is called when a \ has been encountered. It either returns a
402     positive value for a simple escape such as \n, or a negative value which
403 nigel 93 encodes one of the more complicated things such as \d. A backreference to group
404     n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
405     UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
406     ptr is pointing at the \. On exit, it is on the final character of the escape
407     sequence.
408 nigel 77
409     Arguments:
410     ptrptr points to the pattern position pointer
411     errorcodeptr points to the errorcode variable
412     bracount number of previous extracting brackets
413     options the options bits
414     isclass TRUE if inside a character class
415    
416     Returns: zero or positive => a data character
417     negative => a special escape sequence
418     on error, errorptr is set
419     */
420    
421     static int
422     check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
423     int options, BOOL isclass)
424     {
425 nigel 87 BOOL utf8 = (options & PCRE_UTF8) != 0;
426     const uschar *ptr = *ptrptr + 1;
427 nigel 77 int c, i;
428    
429 nigel 87 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
430     ptr--; /* Set pointer back to the last byte */
431    
432 nigel 77 /* If backslash is at the end of the pattern, it's an error. */
433    
434     if (c == 0) *errorcodeptr = ERR1;
435    
436     /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
437     a table. A non-zero result is something that can be returned immediately.
438     Otherwise further processing may be required. */
439    
440 ph10 97 #ifndef EBCDIC /* ASCII coding */
441 nigel 77 else if (c < '0' || c > 'z') {} /* Not alphameric */
442     else if ((i = escapes[c - '0']) != 0) c = i;
443    
444 ph10 97 #else /* EBCDIC coding */
445 nigel 77 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
446     else if ((i = escapes[c - 0x48]) != 0) c = i;
447     #endif
448    
449     /* Escapes that need further processing, or are illegal. */
450    
451     else
452     {
453     const uschar *oldptr;
454 nigel 93 BOOL braced, negated;
455    
456 nigel 77 switch (c)
457     {
458     /* A number of Perl escapes are not handled by PCRE. We give an explicit
459     error. */
460    
461     case 'l':
462     case 'L':
463     case 'N':
464     case 'u':
465     case 'U':
466     *errorcodeptr = ERR37;
467     break;
468    
469 nigel 93 /* \g must be followed by a number, either plain or braced. If positive, it
470     is an absolute backreference. If negative, it is a relative backreference.
471 ph10 172 This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a
472     reference to a named group. This is part of Perl's movement towards a
473     unified syntax for back references. As this is synonymous with \k{name}, we
474 ph10 171 fudge it up by pretending it really was \k. */
475 nigel 93
476     case 'g':
477     if (ptr[1] == '{')
478     {
479 ph10 171 const uschar *p;
480     for (p = ptr+2; *p != 0 && *p != '}'; p++)
481     if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
482 ph10 172 if (*p != 0 && *p != '}')
483 ph10 171 {
484     c = -ESC_k;
485     break;
486 ph10 172 }
487 nigel 93 braced = TRUE;
488     ptr++;
489     }
490     else braced = FALSE;
491    
492     if (ptr[1] == '-')
493     {
494     negated = TRUE;
495     ptr++;
496     }
497     else negated = FALSE;
498    
499     c = 0;
500     while ((digitab[ptr[1]] & ctype_digit) != 0)
501     c = c * 10 + *(++ptr) - '0';
502    
503     if (c == 0 || (braced && *(++ptr) != '}'))
504     {
505     *errorcodeptr = ERR57;
506     return 0;
507     }
508    
509     if (negated)
510     {
511     if (c > bracount)
512     {
513     *errorcodeptr = ERR15;
514     return 0;
515     }
516     c = bracount - (c - 1);
517     }
518    
519     c = -(ESC_REF + c);
520     break;
521    
522 nigel 77 /* The handling of escape sequences consisting of a string of digits
523     starting with one that is not zero is not straightforward. By experiment,
524     the way Perl works seems to be as follows:
525    
526     Outside a character class, the digits are read as a decimal number. If the
527     number is less than 10, or if there are that many previous extracting
528     left brackets, then it is a back reference. Otherwise, up to three octal
529     digits are read to form an escaped byte. Thus \123 is likely to be octal
530     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
531     value is greater than 377, the least significant 8 bits are taken. Inside a
532     character class, \ followed by a digit is always an octal number. */
533    
534     case '1': case '2': case '3': case '4': case '5':
535     case '6': case '7': case '8': case '9':
536    
537     if (!isclass)
538     {
539     oldptr = ptr;
540     c -= '0';
541     while ((digitab[ptr[1]] & ctype_digit) != 0)
542     c = c * 10 + *(++ptr) - '0';
543     if (c < 10 || c <= bracount)
544     {
545     c = -(ESC_REF + c);
546     break;
547     }
548     ptr = oldptr; /* Put the pointer back and fall through */
549     }
550    
551     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
552     generates a binary zero byte and treats the digit as a following literal.
553     Thus we have to pull back the pointer by one. */
554    
555     if ((c = *ptr) >= '8')
556     {
557     ptr--;
558     c = 0;
559     break;
560     }
561    
562     /* \0 always starts an octal number, but we may drop through to here with a
563 nigel 91 larger first octal digit. The original code used just to take the least
564     significant 8 bits of octal numbers (I think this is what early Perls used
565     to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
566     than 3 octal digits. */
567 nigel 77
568     case '0':
569     c -= '0';
570     while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
571     c = c * 8 + *(++ptr) - '0';
572 nigel 91 if (!utf8 && c > 255) *errorcodeptr = ERR51;
573 nigel 77 break;
574    
575 nigel 87 /* \x is complicated. \x{ddd} is a character number which can be greater
576     than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
577     treated as a data character. */
578 nigel 77
579     case 'x':
580 nigel 87 if (ptr[1] == '{')
581 nigel 77 {
582     const uschar *pt = ptr + 2;
583 nigel 87 int count = 0;
584    
585 nigel 77 c = 0;
586     while ((digitab[*pt] & ctype_xdigit) != 0)
587     {
588 nigel 87 register int cc = *pt++;
589     if (c == 0 && cc == '0') continue; /* Leading zeroes */
590 nigel 77 count++;
591 nigel 87
592 ph10 97 #ifndef EBCDIC /* ASCII coding */
593 nigel 77 if (cc >= 'a') cc -= 32; /* Convert to upper case */
594 nigel 87 c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
595 ph10 97 #else /* EBCDIC coding */
596 nigel 77 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
597 nigel 87 c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
598 nigel 77 #endif
599     }
600 nigel 87
601 nigel 77 if (*pt == '}')
602     {
603 nigel 87 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
604 nigel 77 ptr = pt;
605     break;
606     }
607 nigel 87
608 nigel 77 /* If the sequence of hex digits does not end with '}', then we don't
609     recognize this construct; fall through to the normal \x handling. */
610     }
611    
612 nigel 87 /* Read just a single-byte hex-defined char */
613 nigel 77
614     c = 0;
615     while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
616     {
617     int cc; /* Some compilers don't like ++ */
618     cc = *(++ptr); /* in initializers */
619 ph10 97 #ifndef EBCDIC /* ASCII coding */
620 nigel 77 if (cc >= 'a') cc -= 32; /* Convert to upper case */
621     c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
622 ph10 97 #else /* EBCDIC coding */
623 nigel 77 if (cc <= 'z') cc += 64; /* Convert to upper case */
624     c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
625     #endif
626     }
627     break;
628    
629 nigel 93 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
630     This coding is ASCII-specific, but then the whole concept of \cx is
631     ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
632 nigel 77
633     case 'c':
634     c = *(++ptr);
635     if (c == 0)
636     {
637     *errorcodeptr = ERR2;
638     return 0;
639     }
640    
641 ph10 97 #ifndef EBCDIC /* ASCII coding */
642 nigel 77 if (c >= 'a' && c <= 'z') c -= 32;
643     c ^= 0x40;
644 ph10 97 #else /* EBCDIC coding */
645 nigel 77 if (c >= 'a' && c <= 'z') c += 64;
646     c ^= 0xC0;
647     #endif
648     break;
649    
650     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
651     other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
652     for Perl compatibility, it is a literal. This code looks a bit odd, but
653     there used to be some cases other than the default, and there may be again
654     in future, so I haven't "optimized" it. */
655    
656     default:
657     if ((options & PCRE_EXTRA) != 0) switch(c)
658     {
659     default:
660     *errorcodeptr = ERR3;
661     break;
662     }
663     break;
664     }
665     }
666    
667     *ptrptr = ptr;
668     return c;
669     }
670    
671    
672    
673     #ifdef SUPPORT_UCP
674     /*************************************************
675     * Handle \P and \p *
676     *************************************************/
677    
678     /* This function is called after \P or \p has been encountered, provided that
679     PCRE is compiled with support for Unicode properties. On entry, ptrptr is
680     pointing at the P or p. On exit, it is pointing at the final character of the
681     escape sequence.
682    
683     Argument:
684     ptrptr points to the pattern position pointer
685     negptr points to a boolean that is set TRUE for negation else FALSE
686 nigel 87 dptr points to an int that is set to the detailed property value
687 nigel 77 errorcodeptr points to the error code variable
688    
689 nigel 87 Returns: type value from ucp_type_table, or -1 for an invalid type
690 nigel 77 */
691    
692     static int
693 nigel 87 get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
694 nigel 77 {
695     int c, i, bot, top;
696     const uschar *ptr = *ptrptr;
697 nigel 87 char name[32];
698 nigel 77
699     c = *(++ptr);
700     if (c == 0) goto ERROR_RETURN;
701    
702     *negptr = FALSE;
703    
704 nigel 87 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
705     negation. */
706 nigel 77
707     if (c == '{')
708     {
709     if (ptr[1] == '^')
710     {
711     *negptr = TRUE;
712     ptr++;
713     }
714 ph10 199 for (i = 0; i < (int)sizeof(name) - 1; i++)
715 nigel 77 {
716     c = *(++ptr);
717     if (c == 0) goto ERROR_RETURN;
718     if (c == '}') break;
719     name[i] = c;
720     }
721 nigel 87 if (c !='}') goto ERROR_RETURN;
722 nigel 77 name[i] = 0;
723     }
724    
725     /* Otherwise there is just one following character */
726    
727     else
728     {
729     name[0] = c;
730     name[1] = 0;
731     }
732    
733     *ptrptr = ptr;
734    
735     /* Search for a recognized property name using binary chop */
736    
737     bot = 0;
738     top = _pcre_utt_size;
739    
740     while (bot < top)
741     {
742 nigel 87 i = (bot + top) >> 1;
743 nigel 77 c = strcmp(name, _pcre_utt[i].name);
744 nigel 87 if (c == 0)
745     {
746     *dptr = _pcre_utt[i].value;
747     return _pcre_utt[i].type;
748     }
749 nigel 77 if (c > 0) bot = i + 1; else top = i;
750     }
751    
752     *errorcodeptr = ERR47;
753     *ptrptr = ptr;
754     return -1;
755    
756     ERROR_RETURN:
757     *errorcodeptr = ERR46;
758     *ptrptr = ptr;
759     return -1;
760     }
761     #endif
762    
763    
764    
765    
766     /*************************************************
767     * Check for counted repeat *
768     *************************************************/
769    
770     /* This function is called when a '{' is encountered in a place where it might
771     start a quantifier. It looks ahead to see if it really is a quantifier or not.
772     It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
773     where the ddds are digits.
774    
775     Arguments:
776     p pointer to the first char after '{'
777    
778     Returns: TRUE or FALSE
779     */
780    
781     static BOOL
782     is_counted_repeat(const uschar *p)
783     {
784     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
785     while ((digitab[*p] & ctype_digit) != 0) p++;
786     if (*p == '}') return TRUE;
787    
788     if (*p++ != ',') return FALSE;
789     if (*p == '}') return TRUE;
790    
791     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
792     while ((digitab[*p] & ctype_digit) != 0) p++;
793    
794     return (*p == '}');
795     }
796    
797    
798    
799     /*************************************************
800     * Read repeat counts *
801     *************************************************/
802    
803     /* Read an item of the form {n,m} and return the values. This is called only
804     after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
805     so the syntax is guaranteed to be correct, but we need to check the values.
806    
807     Arguments:
808     p pointer to first char after '{'
809     minp pointer to int for min
810     maxp pointer to int for max
811     returned as -1 if no max
812     errorcodeptr points to error code variable
813    
814     Returns: pointer to '}' on success;
815     current ptr on error, with errorcodeptr set non-zero
816     */
817    
818     static const uschar *
819     read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
820     {
821     int min = 0;
822     int max = -1;
823    
824 nigel 81 /* Read the minimum value and do a paranoid check: a negative value indicates
825     an integer overflow. */
826    
827 nigel 77 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
828 nigel 81 if (min < 0 || min > 65535)
829     {
830     *errorcodeptr = ERR5;
831     return p;
832     }
833 nigel 77
834 nigel 81 /* Read the maximum value if there is one, and again do a paranoid on its size.
835     Also, max must not be less than min. */
836    
837 nigel 77 if (*p == '}') max = min; else
838     {
839     if (*(++p) != '}')
840     {
841     max = 0;
842     while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
843 nigel 81 if (max < 0 || max > 65535)
844     {
845     *errorcodeptr = ERR5;
846     return p;
847     }
848 nigel 77 if (max < min)
849     {
850     *errorcodeptr = ERR4;
851     return p;
852     }
853     }
854     }
855    
856 nigel 81 /* Fill in the required variables, and pass back the pointer to the terminating
857     '}'. */
858 nigel 77
859 nigel 81 *minp = min;
860     *maxp = max;
861 nigel 77 return p;
862     }
863    
864    
865    
866     /*************************************************
867 nigel 93 * Find forward referenced subpattern *
868 nigel 91 *************************************************/
869    
870 nigel 93 /* This function scans along a pattern's text looking for capturing
871     subpatterns, and counting them. If it finds a named pattern that matches the
872     name it is given, it returns its number. Alternatively, if the name is NULL, it
873     returns when it reaches a given numbered subpattern. This is used for forward
874     references to subpatterns. We know that if (?P< is encountered, the name will
875     be terminated by '>' because that is checked in the first pass.
876 nigel 91
877     Arguments:
878 nigel 93 ptr current position in the pattern
879     count current count of capturing parens so far encountered
880     name name to seek, or NULL if seeking a numbered subpattern
881     lorn name length, or subpattern number if name is NULL
882     xmode TRUE if we are in /x mode
883 nigel 91
884     Returns: the number of the named subpattern, or -1 if not found
885     */
886    
887     static int
888 nigel 93 find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
889     BOOL xmode)
890 nigel 91 {
891     const uschar *thisname;
892 nigel 93
893 nigel 91 for (; *ptr != 0; ptr++)
894     {
895 nigel 93 int term;
896    
897     /* Skip over backslashed characters and also entire \Q...\E */
898    
899     if (*ptr == '\\')
900     {
901     if (*(++ptr) == 0) return -1;
902     if (*ptr == 'Q') for (;;)
903     {
904     while (*(++ptr) != 0 && *ptr != '\\');
905     if (*ptr == 0) return -1;
906     if (*(++ptr) == 'E') break;
907     }
908     continue;
909     }
910    
911     /* Skip over character classes */
912    
913     if (*ptr == '[')
914     {
915     while (*(++ptr) != ']')
916     {
917     if (*ptr == '\\')
918     {
919     if (*(++ptr) == 0) return -1;
920     if (*ptr == 'Q') for (;;)
921     {
922     while (*(++ptr) != 0 && *ptr != '\\');
923     if (*ptr == 0) return -1;
924     if (*(++ptr) == 'E') break;
925     }
926     continue;
927     }
928     }
929     continue;
930     }
931    
932     /* Skip comments in /x mode */
933    
934     if (xmode && *ptr == '#')
935     {
936     while (*(++ptr) != 0 && *ptr != '\n');
937     if (*ptr == 0) return -1;
938     continue;
939     }
940    
941     /* An opening parens must now be a real metacharacter */
942    
943 nigel 91 if (*ptr != '(') continue;
944 nigel 93 if (ptr[1] != '?')
945     {
946     count++;
947     if (name == NULL && count == lorn) return count;
948     continue;
949     }
950    
951     ptr += 2;
952     if (*ptr == 'P') ptr++; /* Allow optional P */
953    
954     /* We have to disambiguate (?<! and (?<= from (?<name> */
955    
956     if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
957     *ptr != '\'')
958     continue;
959    
960 nigel 91 count++;
961 nigel 93
962     if (name == NULL && count == lorn) return count;
963     term = *ptr++;
964     if (term == '<') term = '>';
965 nigel 91 thisname = ptr;
966 nigel 93 while (*ptr != term) ptr++;
967     if (name != NULL && lorn == ptr - thisname &&
968     strncmp((const char *)name, (const char *)thisname, lorn) == 0)
969 nigel 91 return count;
970     }
971 nigel 93
972 nigel 91 return -1;
973     }
974    
975    
976    
977     /*************************************************
978 nigel 77 * Find first significant op code *
979     *************************************************/
980    
981     /* This is called by several functions that scan a compiled expression looking
982     for a fixed first character, or an anchoring op code etc. It skips over things
983     that do not influence this. For some calls, a change of option is important.
984     For some calls, it makes sense to skip negative forward and all backward
985     assertions, and also the \b assertion; for others it does not.
986    
987     Arguments:
988     code pointer to the start of the group
989     options pointer to external options
990     optbit the option bit whose changing is significant, or
991     zero if none are
992     skipassert TRUE if certain assertions are to be skipped
993    
994     Returns: pointer to the first significant opcode
995     */
996    
997     static const uschar*
998     first_significant_code(const uschar *code, int *options, int optbit,
999     BOOL skipassert)
1000     {
1001     for (;;)
1002     {
1003     switch ((int)*code)
1004     {
1005     case OP_OPT:
1006     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1007     *options = (int)code[1];
1008     code += 2;
1009     break;
1010    
1011     case OP_ASSERT_NOT:
1012     case OP_ASSERTBACK:
1013     case OP_ASSERTBACK_NOT:
1014     if (!skipassert) return code;
1015     do code += GET(code, 1); while (*code == OP_ALT);
1016     code += _pcre_OP_lengths[*code];
1017     break;
1018    
1019     case OP_WORD_BOUNDARY:
1020     case OP_NOT_WORD_BOUNDARY:
1021     if (!skipassert) return code;
1022     /* Fall through */
1023    
1024     case OP_CALLOUT:
1025     case OP_CREF:
1026 nigel 93 case OP_RREF:
1027     case OP_DEF:
1028 nigel 77 code += _pcre_OP_lengths[*code];
1029     break;
1030    
1031     default:
1032     return code;
1033     }
1034     }
1035     /* Control never reaches here */
1036     }
1037    
1038    
1039    
1040    
1041     /*************************************************
1042     * Find the fixed length of a pattern *
1043     *************************************************/
1044    
1045     /* Scan a pattern and compute the fixed length of subject that will match it,
1046     if the length is fixed. This is needed for dealing with backward assertions.
1047     In UTF8 mode, the result is in characters rather than bytes.
1048    
1049     Arguments:
1050     code points to the start of the pattern (the bracket)
1051     options the compiling options
1052    
1053     Returns: the fixed length, or -1 if there is no fixed length,
1054     or -2 if \C was encountered
1055     */
1056    
1057     static int
1058     find_fixedlength(uschar *code, int options)
1059     {
1060     int length = -1;
1061    
1062     register int branchlength = 0;
1063     register uschar *cc = code + 1 + LINK_SIZE;
1064    
1065     /* Scan along the opcodes for this branch. If we get to the end of the
1066     branch, check the length against that of the other branches. */
1067    
1068     for (;;)
1069     {
1070     int d;
1071     register int op = *cc;
1072    
1073     switch (op)
1074     {
1075 nigel 93 case OP_CBRA:
1076 nigel 77 case OP_BRA:
1077     case OP_ONCE:
1078     case OP_COND:
1079 nigel 93 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1080 nigel 77 if (d < 0) return d;
1081     branchlength += d;
1082     do cc += GET(cc, 1); while (*cc == OP_ALT);
1083     cc += 1 + LINK_SIZE;
1084     break;
1085    
1086     /* Reached end of a branch; if it's a ket it is the end of a nested
1087     call. If it's ALT it is an alternation in a nested call. If it is
1088     END it's the end of the outer call. All can be handled by the same code. */
1089    
1090     case OP_ALT:
1091     case OP_KET:
1092     case OP_KETRMAX:
1093     case OP_KETRMIN:
1094     case OP_END:
1095     if (length < 0) length = branchlength;
1096     else if (length != branchlength) return -1;
1097     if (*cc != OP_ALT) return length;
1098     cc += 1 + LINK_SIZE;
1099     branchlength = 0;
1100     break;
1101    
1102     /* Skip over assertive subpatterns */
1103    
1104     case OP_ASSERT:
1105     case OP_ASSERT_NOT:
1106     case OP_ASSERTBACK:
1107     case OP_ASSERTBACK_NOT:
1108     do cc += GET(cc, 1); while (*cc == OP_ALT);
1109     /* Fall through */
1110    
1111     /* Skip over things that don't match chars */
1112    
1113     case OP_REVERSE:
1114     case OP_CREF:
1115 nigel 93 case OP_RREF:
1116     case OP_DEF:
1117 nigel 77 case OP_OPT:
1118     case OP_CALLOUT:
1119     case OP_SOD:
1120     case OP_SOM:
1121     case OP_EOD:
1122     case OP_EODN:
1123     case OP_CIRC:
1124     case OP_DOLL:
1125     case OP_NOT_WORD_BOUNDARY:
1126     case OP_WORD_BOUNDARY:
1127     cc += _pcre_OP_lengths[*cc];
1128     break;
1129    
1130     /* Handle literal characters */
1131    
1132     case OP_CHAR:
1133     case OP_CHARNC:
1134 nigel 91 case OP_NOT:
1135 nigel 77 branchlength++;
1136     cc += 2;
1137     #ifdef SUPPORT_UTF8
1138     if ((options & PCRE_UTF8) != 0)
1139     {
1140     while ((*cc & 0xc0) == 0x80) cc++;
1141     }
1142     #endif
1143     break;
1144    
1145     /* Handle exact repetitions. The count is already in characters, but we
1146     need to skip over a multibyte character in UTF8 mode. */
1147    
1148     case OP_EXACT:
1149     branchlength += GET2(cc,1);
1150     cc += 4;
1151     #ifdef SUPPORT_UTF8
1152     if ((options & PCRE_UTF8) != 0)
1153     {
1154     while((*cc & 0x80) == 0x80) cc++;
1155     }
1156     #endif
1157     break;
1158    
1159     case OP_TYPEEXACT:
1160     branchlength += GET2(cc,1);
1161     cc += 4;
1162     break;
1163    
1164     /* Handle single-char matchers */
1165    
1166     case OP_PROP:
1167     case OP_NOTPROP:
1168 nigel 87 cc += 2;
1169 nigel 77 /* Fall through */
1170    
1171     case OP_NOT_DIGIT:
1172     case OP_DIGIT:
1173     case OP_NOT_WHITESPACE:
1174     case OP_WHITESPACE:
1175     case OP_NOT_WORDCHAR:
1176     case OP_WORDCHAR:
1177     case OP_ANY:
1178     branchlength++;
1179     cc++;
1180     break;
1181    
1182     /* The single-byte matcher isn't allowed */
1183    
1184     case OP_ANYBYTE:
1185     return -2;
1186    
1187     /* Check a class for variable quantification */
1188    
1189     #ifdef SUPPORT_UTF8
1190     case OP_XCLASS:
1191     cc += GET(cc, 1) - 33;
1192     /* Fall through */
1193     #endif
1194    
1195     case OP_CLASS:
1196     case OP_NCLASS:
1197     cc += 33;
1198    
1199     switch (*cc)
1200     {
1201     case OP_CRSTAR:
1202     case OP_CRMINSTAR:
1203     case OP_CRQUERY:
1204     case OP_CRMINQUERY:
1205     return -1;
1206    
1207     case OP_CRRANGE:
1208     case OP_CRMINRANGE:
1209     if (GET2(cc,1) != GET2(cc,3)) return -1;
1210     branchlength += GET2(cc,1);
1211     cc += 5;
1212     break;
1213    
1214     default:
1215     branchlength++;
1216     }
1217     break;
1218    
1219     /* Anything else is variable length */
1220    
1221     default:
1222     return -1;
1223     }
1224     }
1225     /* Control never gets here */
1226     }
1227    
1228    
1229    
1230    
1231     /*************************************************
1232     * Scan compiled regex for numbered bracket *
1233     *************************************************/
1234    
1235     /* This little function scans through a compiled pattern until it finds a
1236     capturing bracket with the given number.
1237    
1238     Arguments:
1239     code points to start of expression
1240     utf8 TRUE in UTF-8 mode
1241     number the required bracket number
1242    
1243     Returns: pointer to the opcode for the bracket, or NULL if not found
1244     */
1245    
1246     static const uschar *
1247     find_bracket(const uschar *code, BOOL utf8, int number)
1248     {
1249     for (;;)
1250     {
1251     register int c = *code;
1252     if (c == OP_END) return NULL;
1253 nigel 91
1254     /* XCLASS is used for classes that cannot be represented just by a bit
1255     map. This includes negated single high-valued characters. The length in
1256     the table is zero; the actual length is stored in the compiled code. */
1257    
1258     if (c == OP_XCLASS) code += GET(code, 1);
1259    
1260 nigel 93 /* Handle capturing bracket */
1261 nigel 91
1262 nigel 93 else if (c == OP_CBRA)
1263 nigel 77 {
1264 nigel 93 int n = GET2(code, 1+LINK_SIZE);
1265 nigel 77 if (n == number) return (uschar *)code;
1266 nigel 93 code += _pcre_OP_lengths[c];
1267 nigel 77 }
1268 nigel 91
1269 nigel 93 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1270     a multi-byte character. The length in the table is a minimum, so we have to
1271     arrange to skip the extra bytes. */
1272 nigel 91
1273 nigel 77 else
1274     {
1275     code += _pcre_OP_lengths[c];
1276 ph10 107 #ifdef SUPPORT_UTF8
1277 nigel 77 if (utf8) switch(c)
1278     {
1279     case OP_CHAR:
1280     case OP_CHARNC:
1281     case OP_EXACT:
1282     case OP_UPTO:
1283     case OP_MINUPTO:
1284 nigel 93 case OP_POSUPTO:
1285 nigel 77 case OP_STAR:
1286     case OP_MINSTAR:
1287 nigel 93 case OP_POSSTAR:
1288 nigel 77 case OP_PLUS:
1289     case OP_MINPLUS:
1290 nigel 93 case OP_POSPLUS:
1291 nigel 77 case OP_QUERY:
1292     case OP_MINQUERY:
1293 nigel 93 case OP_POSQUERY:
1294     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1295 nigel 77 break;
1296     }
1297 ph10 111 #endif
1298 nigel 77 }
1299     }
1300     }
1301    
1302    
1303    
1304     /*************************************************
1305     * Scan compiled regex for recursion reference *
1306     *************************************************/
1307    
1308     /* This little function scans through a compiled pattern until it finds an
1309     instance of OP_RECURSE.
1310    
1311     Arguments:
1312     code points to start of expression
1313     utf8 TRUE in UTF-8 mode
1314    
1315     Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1316     */
1317    
1318     static const uschar *
1319     find_recurse(const uschar *code, BOOL utf8)
1320     {
1321     for (;;)
1322     {
1323     register int c = *code;
1324     if (c == OP_END) return NULL;
1325 nigel 91 if (c == OP_RECURSE) return code;
1326    
1327     /* XCLASS is used for classes that cannot be represented just by a bit
1328     map. This includes negated single high-valued characters. The length in
1329     the table is zero; the actual length is stored in the compiled code. */
1330    
1331     if (c == OP_XCLASS) code += GET(code, 1);
1332    
1333     /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes
1334     that are followed by a character may be followed by a multi-byte character.
1335 nigel 93 The length in the table is a minimum, so we have to arrange to skip the extra
1336     bytes. */
1337 nigel 91
1338 nigel 77 else
1339     {
1340     code += _pcre_OP_lengths[c];
1341 ph10 107 #ifdef SUPPORT_UTF8
1342 nigel 77 if (utf8) switch(c)
1343     {
1344     case OP_CHAR:
1345     case OP_CHARNC:
1346     case OP_EXACT:
1347     case OP_UPTO:
1348     case OP_MINUPTO:
1349 nigel 93 case OP_POSUPTO:
1350 nigel 77 case OP_STAR:
1351     case OP_MINSTAR:
1352 nigel 93 case OP_POSSTAR:
1353 nigel 77 case OP_PLUS:
1354     case OP_MINPLUS:
1355 nigel 93 case OP_POSPLUS:
1356 nigel 77 case OP_QUERY:
1357     case OP_MINQUERY:
1358 nigel 93 case OP_POSQUERY:
1359     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1360 nigel 77 break;
1361     }
1362 ph10 111 #endif
1363 nigel 77 }
1364     }
1365     }
1366    
1367    
1368    
1369     /*************************************************
1370     * Scan compiled branch for non-emptiness *
1371     *************************************************/
1372    
1373     /* This function scans through a branch of a compiled pattern to see whether it
1374 nigel 93 can match the empty string or not. It is called from could_be_empty()
1375     below and from compile_branch() when checking for an unlimited repeat of a
1376     group that can match nothing. Note that first_significant_code() skips over
1377     assertions. If we hit an unclosed bracket, we return "empty" - this means we've
1378     struck an inner bracket whose current branch will already have been scanned.
1379 nigel 77
1380     Arguments:
1381     code points to start of search
1382     endcode points to where to stop
1383     utf8 TRUE if in UTF8 mode
1384    
1385     Returns: TRUE if what is matched could be empty
1386     */
1387    
1388     static BOOL
1389     could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1390     {
1391     register int c;
1392 nigel 93 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1393 nigel 77 code < endcode;
1394     code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1395     {
1396     const uschar *ccode;
1397    
1398     c = *code;
1399 ph10 172
1400 ph10 170 /* Groups with zero repeats can of course be empty; skip them. */
1401 nigel 77
1402 ph10 170 if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1403     {
1404 ph10 172 code += _pcre_OP_lengths[c];
1405 ph10 170 do code += GET(code, 1); while (*code == OP_ALT);
1406     c = *code;
1407     continue;
1408     }
1409    
1410     /* For other groups, scan the branches. */
1411 ph10 172
1412 nigel 93 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE)
1413 nigel 77 {
1414     BOOL empty_branch;
1415     if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1416    
1417     /* Scan a closed bracket */
1418    
1419     empty_branch = FALSE;
1420     do
1421     {
1422     if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1423     empty_branch = TRUE;
1424     code += GET(code, 1);
1425     }
1426     while (*code == OP_ALT);
1427     if (!empty_branch) return FALSE; /* All branches are non-empty */
1428 ph10 172 c = *code;
1429 nigel 93 continue;
1430 nigel 77 }
1431    
1432 nigel 93 /* Handle the other opcodes */
1433    
1434     switch (c)
1435 nigel 77 {
1436     /* Check for quantifiers after a class */
1437    
1438     #ifdef SUPPORT_UTF8
1439     case OP_XCLASS:
1440     ccode = code + GET(code, 1);
1441     goto CHECK_CLASS_REPEAT;
1442     #endif
1443    
1444     case OP_CLASS:
1445     case OP_NCLASS:
1446     ccode = code + 33;
1447    
1448     #ifdef SUPPORT_UTF8
1449     CHECK_CLASS_REPEAT:
1450     #endif
1451    
1452     switch (*ccode)
1453     {
1454     case OP_CRSTAR: /* These could be empty; continue */
1455     case OP_CRMINSTAR:
1456     case OP_CRQUERY:
1457     case OP_CRMINQUERY:
1458     break;
1459    
1460     default: /* Non-repeat => class must match */
1461     case OP_CRPLUS: /* These repeats aren't empty */
1462     case OP_CRMINPLUS:
1463     return FALSE;
1464    
1465     case OP_CRRANGE:
1466     case OP_CRMINRANGE:
1467     if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1468     break;
1469     }
1470     break;
1471    
1472     /* Opcodes that must match a character */
1473    
1474     case OP_PROP:
1475     case OP_NOTPROP:
1476     case OP_EXTUNI:
1477     case OP_NOT_DIGIT:
1478     case OP_DIGIT:
1479     case OP_NOT_WHITESPACE:
1480     case OP_WHITESPACE:
1481     case OP_NOT_WORDCHAR:
1482     case OP_WORDCHAR:
1483     case OP_ANY:
1484     case OP_ANYBYTE:
1485     case OP_CHAR:
1486     case OP_CHARNC:
1487     case OP_NOT:
1488     case OP_PLUS:
1489     case OP_MINPLUS:
1490 nigel 93 case OP_POSPLUS:
1491 nigel 77 case OP_EXACT:
1492     case OP_NOTPLUS:
1493     case OP_NOTMINPLUS:
1494 nigel 93 case OP_NOTPOSPLUS:
1495 nigel 77 case OP_NOTEXACT:
1496     case OP_TYPEPLUS:
1497     case OP_TYPEMINPLUS:
1498 nigel 93 case OP_TYPEPOSPLUS:
1499 nigel 77 case OP_TYPEEXACT:
1500     return FALSE;
1501    
1502     /* End of branch */
1503    
1504     case OP_KET:
1505     case OP_KETRMAX:
1506     case OP_KETRMIN:
1507     case OP_ALT:
1508     return TRUE;
1509    
1510 nigel 93 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1511     MINUPTO, and POSUPTO may be followed by a multibyte character */
1512 nigel 77
1513     #ifdef SUPPORT_UTF8
1514     case OP_STAR:
1515     case OP_MINSTAR:
1516 nigel 93 case OP_POSSTAR:
1517 nigel 77 case OP_QUERY:
1518     case OP_MINQUERY:
1519 nigel 93 case OP_POSQUERY:
1520 nigel 77 case OP_UPTO:
1521     case OP_MINUPTO:
1522 nigel 93 case OP_POSUPTO:
1523 nigel 77 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1524     break;
1525     #endif
1526     }
1527     }
1528    
1529     return TRUE;
1530     }
1531    
1532    
1533    
1534     /*************************************************
1535     * Scan compiled regex for non-emptiness *
1536     *************************************************/
1537    
1538     /* This function is called to check for left recursive calls. We want to check
1539     the current branch of the current pattern to see if it could match the empty
1540     string. If it could, we must look outwards for branches at other levels,
1541     stopping when we pass beyond the bracket which is the subject of the recursion.
1542    
1543     Arguments:
1544     code points to start of the recursion
1545     endcode points to where to stop (current RECURSE item)
1546     bcptr points to the chain of current (unclosed) branch starts
1547     utf8 TRUE if in UTF-8 mode
1548    
1549     Returns: TRUE if what is matched could be empty
1550     */
1551    
1552     static BOOL
1553     could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1554     BOOL utf8)
1555     {
1556     while (bcptr != NULL && bcptr->current >= code)
1557     {
1558     if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1559     bcptr = bcptr->outer;
1560     }
1561     return TRUE;
1562     }
1563    
1564    
1565    
1566     /*************************************************
1567     * Check for POSIX class syntax *
1568     *************************************************/
1569    
1570     /* This function is called when the sequence "[:" or "[." or "[=" is
1571     encountered in a character class. It checks whether this is followed by an
1572     optional ^ and then a sequence of letters, terminated by a matching ":]" or
1573     ".]" or "=]".
1574    
1575     Argument:
1576     ptr pointer to the initial [
1577     endptr where to return the end pointer
1578     cd pointer to compile data
1579    
1580     Returns: TRUE or FALSE
1581     */
1582    
1583     static BOOL
1584     check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1585     {
1586     int terminator; /* Don't combine these lines; the Solaris cc */
1587     terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1588     if (*(++ptr) == '^') ptr++;
1589     while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1590     if (*ptr == terminator && ptr[1] == ']')
1591     {
1592     *endptr = ptr;
1593     return TRUE;
1594     }
1595     return FALSE;
1596     }
1597    
1598    
1599    
1600    
1601     /*************************************************
1602     * Check POSIX class name *
1603     *************************************************/
1604    
1605     /* This function is called to check the name given in a POSIX-style class entry
1606     such as [:alnum:].
1607    
1608     Arguments:
1609     ptr points to the first letter
1610     len the length of the name
1611    
1612     Returns: a value representing the name, or -1 if unknown
1613     */
1614    
1615     static int
1616     check_posix_name(const uschar *ptr, int len)
1617     {
1618     register int yield = 0;
1619     while (posix_name_lengths[yield] != 0)
1620     {
1621     if (len == posix_name_lengths[yield] &&
1622     strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1623     yield++;
1624     }
1625     return -1;
1626     }
1627    
1628    
1629     /*************************************************
1630     * Adjust OP_RECURSE items in repeated group *
1631     *************************************************/
1632    
1633     /* OP_RECURSE items contain an offset from the start of the regex to the group
1634     that is referenced. This means that groups can be replicated for fixed
1635     repetition simply by copying (because the recursion is allowed to refer to
1636     earlier groups that are outside the current group). However, when a group is
1637     optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1638     it, after it has been compiled. This means that any OP_RECURSE items within it
1639     that refer to the group itself or any contained groups have to have their
1640 nigel 93 offsets adjusted. That is one of the jobs of this function. Before it is called,
1641     the partially compiled regex must be temporarily terminated with OP_END.
1642 nigel 77
1643 nigel 93 This function has been extended with the possibility of forward references for
1644     recursions and subroutine calls. It must also check the list of such references
1645     for the group we are dealing with. If it finds that one of the recursions in
1646     the current group is on this list, it adjusts the offset in the list, not the
1647     value in the reference (which is a group number).
1648    
1649 nigel 77 Arguments:
1650     group points to the start of the group
1651     adjust the amount by which the group is to be moved
1652     utf8 TRUE in UTF-8 mode
1653     cd contains pointers to tables etc.
1654 nigel 93 save_hwm the hwm forward reference pointer at the start of the group
1655 nigel 77
1656     Returns: nothing
1657     */
1658    
1659     static void
1660 nigel 93 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1661     uschar *save_hwm)
1662 nigel 77 {
1663     uschar *ptr = group;
1664     while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1665     {
1666 nigel 93 int offset;
1667     uschar *hc;
1668    
1669     /* See if this recursion is on the forward reference list. If so, adjust the
1670     reference. */
1671    
1672     for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1673     {
1674     offset = GET(hc, 0);
1675     if (cd->start_code + offset == ptr + 1)
1676     {
1677     PUT(hc, 0, offset + adjust);
1678     break;
1679     }
1680     }
1681    
1682     /* Otherwise, adjust the recursion offset if it's after the start of this
1683     group. */
1684    
1685     if (hc >= cd->hwm)
1686     {
1687     offset = GET(ptr, 1);
1688     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1689     }
1690    
1691 nigel 77 ptr += 1 + LINK_SIZE;
1692     }
1693     }
1694    
1695    
1696    
1697     /*************************************************
1698     * Insert an automatic callout point *
1699     *************************************************/
1700    
1701     /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1702     callout points before each pattern item.
1703    
1704     Arguments:
1705     code current code pointer
1706     ptr current pattern pointer
1707     cd pointers to tables etc
1708    
1709     Returns: new code pointer
1710     */
1711    
1712     static uschar *
1713     auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1714     {
1715     *code++ = OP_CALLOUT;
1716     *code++ = 255;
1717     PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1718     PUT(code, LINK_SIZE, 0); /* Default length */
1719     return code + 2*LINK_SIZE;
1720     }
1721    
1722    
1723    
1724     /*************************************************
1725     * Complete a callout item *
1726     *************************************************/
1727    
1728     /* A callout item contains the length of the next item in the pattern, which
1729     we can't fill in till after we have reached the relevant point. This is used
1730     for both automatic and manual callouts.
1731    
1732     Arguments:
1733     previous_callout points to previous callout item
1734     ptr current pattern pointer
1735     cd pointers to tables etc
1736    
1737     Returns: nothing
1738     */
1739    
1740     static void
1741     complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1742     {
1743     int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1744     PUT(previous_callout, 2 + LINK_SIZE, length);
1745     }
1746    
1747    
1748    
#ifdef SUPPORT_UCP
/*************************************************
*           Get othercase range                  *
*************************************************/

/* Given the bounds of a class range (UTF-8 mode with UCP support), find the
next run of characters within it whose other-case values are consecutive.
Characters for which _pcre_ucp_othercase() yields NOTACHAR are skipped at the
front; the run then extends for as long as each following character's other
case is exactly one more than the previous one. The start value is advanced
past the run, so repeated calls step through the whole range.

Arguments:
  cptr        points to the value at which to start looking; updated to the
                first value after the returned run
  d           last value of the range
  ocptr       receives the first other-case value of the run
  odptr       receives the last other-case value of the run

Yield:        TRUE if a run was found and returned; FALSE if there are no
                more (the output arguments are then left untouched)
*/

static BOOL
get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
  unsigned int *odptr)
{
unsigned int c = *cptr;
unsigned int first_other = NOTACHAR;
unsigned int expected;

/* Find the first character in the range that has an other case */

while (c <= d)
  {
  first_other = _pcre_ucp_othercase(c);
  if (first_other != NOTACHAR) break;
  c++;
  }

if (c > d) return FALSE;

/* Extend the run while the other-case values stay consecutive */

*ocptr = first_other;
expected = first_other + 1;
c++;

while (c <= d && _pcre_ucp_othercase(c) == expected)
  {
  expected++;
  c++;
  }

*odptr = expected - 1;
*cptr = c;

return TRUE;
}
#endif  /* SUPPORT_UCP */
1794    
1795    
1796 nigel 93
1797 nigel 77 /*************************************************
1798 nigel 93 * Check if auto-possessifying is possible *
1799     *************************************************/
1800    
1801     /* This function is called for unlimited repeats of certain items, to see
1802     whether the next thing could possibly match the repeated item. If not, it makes
1803     sense to automatically possessify the repeated item.
1804    
1805     Arguments:
1806     op_code the repeated op code
1807     this data for this item, depends on the opcode
1808     utf8 TRUE in UTF-8 mode
1809     utf8_char used for utf8 character bytes, NULL if not relevant
1810     ptr next character in pattern
1811     options options bits
1812     cd contains pointers to tables etc.
1813    
1814     Returns: TRUE if possessifying is wanted
1815     */
1816    
1817     static BOOL
1818     check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
1819     const uschar *ptr, int options, compile_data *cd)
1820     {
1821     int next;
1822    
1823     /* Skip whitespace and comments in extended mode */
1824    
1825     if ((options & PCRE_EXTENDED) != 0)
1826     {
1827     for (;;)
1828     {
1829     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1830     if (*ptr == '#')
1831     {
1832     while (*(++ptr) != 0)
1833     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1834     }
1835     else break;
1836     }
1837     }
1838    
1839     /* If the next item is one that we can handle, get its value. A non-negative
1840     value is a character, a negative value is an escape value. */
1841    
1842     if (*ptr == '\\')
1843     {
1844     int temperrorcode = 0;
1845     next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
1846     if (temperrorcode != 0) return FALSE;
1847     ptr++; /* Point after the escape sequence */
1848     }
1849    
1850     else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
1851     {
1852     #ifdef SUPPORT_UTF8
1853     if (utf8) { GETCHARINC(next, ptr); } else
1854     #endif
1855     next = *ptr++;
1856     }
1857    
1858     else return FALSE;
1859    
1860     /* Skip whitespace and comments in extended mode */
1861    
1862     if ((options & PCRE_EXTENDED) != 0)
1863     {
1864     for (;;)
1865     {
1866     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1867     if (*ptr == '#')
1868     {
1869     while (*(++ptr) != 0)
1870     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1871     }
1872     else break;
1873     }
1874     }
1875    
1876     /* If the next thing is itself optional, we have to give up. */
1877    
1878     if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
1879     return FALSE;
1880    
1881     /* Now compare the next item with the previous opcode. If the previous is a
1882     positive single character match, "item" either contains the character or, if
1883     "item" is greater than 127 in utf8 mode, the character's bytes are in
1884     utf8_char. */
1885    
1886    
1887     /* Handle cases when the next item is a character. */
1888    
1889     if (next >= 0) switch(op_code)
1890     {
1891     case OP_CHAR:
1892     #ifdef SUPPORT_UTF8
1893     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1894     #endif
1895     return item != next;
1896    
1897     /* For CHARNC (caseless character) we must check the other case. If we have
1898     Unicode property support, we can use it to test the other case of
1899     high-valued characters. */
1900    
1901     case OP_CHARNC:
1902     #ifdef SUPPORT_UTF8
1903     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1904     #endif
1905     if (item == next) return FALSE;
1906     #ifdef SUPPORT_UTF8
1907     if (utf8)
1908     {
1909     unsigned int othercase;
1910     if (next < 128) othercase = cd->fcc[next]; else
1911     #ifdef SUPPORT_UCP
1912     othercase = _pcre_ucp_othercase((unsigned int)next);
1913     #else
1914     othercase = NOTACHAR;
1915     #endif
1916     return (unsigned int)item != othercase;
1917     }
1918     else
1919     #endif /* SUPPORT_UTF8 */
1920     return (item != cd->fcc[next]); /* Non-UTF-8 mode */
1921    
1922     /* For OP_NOT, "item" must be a single-byte character. */
1923    
1924     case OP_NOT:
1925     if (next < 0) return FALSE; /* Not a character */
1926     if (item == next) return TRUE;
1927     if ((options & PCRE_CASELESS) == 0) return FALSE;
1928     #ifdef SUPPORT_UTF8
1929     if (utf8)
1930     {
1931     unsigned int othercase;
1932     if (next < 128) othercase = cd->fcc[next]; else
1933     #ifdef SUPPORT_UCP
1934     othercase = _pcre_ucp_othercase(next);
1935     #else
1936     othercase = NOTACHAR;
1937     #endif
1938     return (unsigned int)item == othercase;
1939     }
1940     else
1941     #endif /* SUPPORT_UTF8 */
1942     return (item == cd->fcc[next]); /* Non-UTF-8 mode */
1943    
1944     case OP_DIGIT:
1945     return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
1946    
1947     case OP_NOT_DIGIT:
1948     return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
1949    
1950     case OP_WHITESPACE:
1951     return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
1952    
1953     case OP_NOT_WHITESPACE:
1954     return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
1955    
1956     case OP_WORDCHAR:
1957     return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
1958    
1959     case OP_NOT_WORDCHAR:
1960     return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
1961    
1962 ph10 180 case OP_HSPACE:
1963     case OP_NOT_HSPACE:
1964     switch(next)
1965     {
1966     case 0x09:
1967     case 0x20:
1968     case 0xa0:
1969     case 0x1680:
1970     case 0x180e:
1971     case 0x2000:
1972     case 0x2001:
1973     case 0x2002:
1974     case 0x2003:
1975     case 0x2004:
1976     case 0x2005:
1977     case 0x2006:
1978     case 0x2007:
1979     case 0x2008:
1980     case 0x2009:
1981     case 0x200A:
1982     case 0x202f:
1983     case 0x205f:
1984     case 0x3000:
1985     return op_code != OP_HSPACE;
1986     default:
1987     return op_code == OP_HSPACE;
1988     }
1989    
1990     case OP_VSPACE:
1991     case OP_NOT_VSPACE:
1992     switch(next)
1993     {
1994     case 0x0a:
1995     case 0x0b:
1996     case 0x0c:
1997     case 0x0d:
1998     case 0x85:
1999     case 0x2028:
2000     case 0x2029:
2001     return op_code != OP_VSPACE;
2002     default:
2003     return op_code == OP_VSPACE;
2004     }
2005    
2006 nigel 93 default:
2007     return FALSE;
2008     }
2009    
2010    
2011     /* Handle the case when the next item is \d, \s, etc. */
2012    
2013     switch(op_code)
2014     {
2015     case OP_CHAR:
2016     case OP_CHARNC:
2017     #ifdef SUPPORT_UTF8
2018     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2019     #endif
2020     switch(-next)
2021     {
2022     case ESC_d:
2023     return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2024    
2025     case ESC_D:
2026     return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2027    
2028     case ESC_s:
2029     return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2030    
2031     case ESC_S:
2032     return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2033    
2034     case ESC_w:
2035     return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2036    
2037     case ESC_W:
2038     return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2039 ph10 182
2040 ph10 180 case ESC_h:
2041     case ESC_H:
2042     switch(item)
2043     {
2044     case 0x09:
2045     case 0x20:
2046     case 0xa0:
2047     case 0x1680:
2048     case 0x180e:
2049     case 0x2000:
2050     case 0x2001:
2051     case 0x2002:
2052     case 0x2003:
2053     case 0x2004:
2054     case 0x2005:
2055     case 0x2006:
2056     case 0x2007:
2057     case 0x2008:
2058     case 0x2009:
2059     case 0x200A:
2060     case 0x202f:
2061     case 0x205f:
2062     case 0x3000:
2063     return -next != ESC_h;
2064     default:
2065     return -next == ESC_h;
2066 ph10 182 }
2067    
2068 ph10 180 case ESC_v:
2069     case ESC_V:
2070     switch(item)
2071     {
2072     case 0x0a:
2073     case 0x0b:
2074     case 0x0c:
2075     case 0x0d:
2076     case 0x85:
2077     case 0x2028:
2078     case 0x2029:
2079     return -next != ESC_v;
2080     default:
2081     return -next == ESC_v;
2082 ph10 182 }
2083 nigel 93
2084     default:
2085     return FALSE;
2086     }
2087    
2088     case OP_DIGIT:
2089 ph10 180 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2090     next == -ESC_h || next == -ESC_v;
2091 nigel 93
2092     case OP_NOT_DIGIT:
2093     return next == -ESC_d;
2094    
2095     case OP_WHITESPACE:
2096     return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2097    
2098     case OP_NOT_WHITESPACE:
2099 ph10 180 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2100 nigel 93
2101 ph10 180 case OP_HSPACE:
2102     return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2103    
2104     case OP_NOT_HSPACE:
2105     return next == -ESC_h;
2106 ph10 182
2107 ph10 180 /* Can't have \S in here because VT matches \S (Perl anomaly) */
2108 ph10 182 case OP_VSPACE:
2109 ph10 180 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2110    
2111     case OP_NOT_VSPACE:
2112 ph10 182 return next == -ESC_v;
2113 ph10 180
2114 nigel 93 case OP_WORDCHAR:
2115 ph10 180 return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2116 nigel 93
2117     case OP_NOT_WORDCHAR:
2118     return next == -ESC_w || next == -ESC_d;
2119 ph10 182
2120 nigel 93 default:
2121     return FALSE;
2122     }
2123    
2124     /* Control does not reach here */
2125     }
2126    
2127    
2128    
2129     /*************************************************
2130 nigel 77 * Compile one branch *
2131     *************************************************/
2132    
2133 nigel 93 /* Scan the pattern, compiling it into a vector. If the options are
2134 nigel 77 changed during the branch, the pointer is used to change the external options
2135 nigel 93 bits. This function is used during the pre-compile phase when we are trying
2136     to find out the amount of memory needed, as well as during the real compile
2137     phase. The value of lengthptr distinguishes the two phases.
2138 nigel 77
2139     Arguments:
2140     optionsptr pointer to the option bits
2141     codeptr points to the pointer to the current code point
2142     ptrptr points to the current pattern pointer
2143     errorcodeptr points to error code variable
2144     firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2145     reqbyteptr set to the last literal character required, else < 0
2146     bcptr points to current branch chain
2147     cd contains pointers to tables etc.
2148 nigel 93 lengthptr NULL during the real compile phase
2149     points to length accumulator during pre-compile phase
2150 nigel 77
2151     Returns: TRUE on success
2152     FALSE, with *errorcodeptr set non-zero on error
2153     */
2154    
2155     static BOOL
2156 nigel 93 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2157     int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2158     compile_data *cd, int *lengthptr)
2159 nigel 77 {
2160     int repeat_type, op_type;
2161     int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2162     int bravalue = 0;
2163     int greedy_default, greedy_non_default;
2164     int firstbyte, reqbyte;
2165     int zeroreqbyte, zerofirstbyte;
2166     int req_caseopt, reqvary, tempreqvary;
2167     int options = *optionsptr;
2168     int after_manual_callout = 0;
2169 nigel 93 int length_prevgroup = 0;
2170 nigel 77 register int c;
2171     register uschar *code = *codeptr;
2172 nigel 93 uschar *last_code = code;
2173     uschar *orig_code = code;
2174 nigel 77 uschar *tempcode;
2175     BOOL inescq = FALSE;
2176     BOOL groupsetfirstbyte = FALSE;
2177     const uschar *ptr = *ptrptr;
2178     const uschar *tempptr;
2179     uschar *previous = NULL;
2180     uschar *previous_callout = NULL;
2181 nigel 93 uschar *save_hwm = NULL;
2182 nigel 77 uschar classbits[32];
2183    
2184     #ifdef SUPPORT_UTF8
2185     BOOL class_utf8;
2186     BOOL utf8 = (options & PCRE_UTF8) != 0;
2187     uschar *class_utf8data;
2188     uschar utf8_char[6];
2189     #else
2190     BOOL utf8 = FALSE;
2191 nigel 93 uschar *utf8_char = NULL;
2192 nigel 77 #endif
2193    
2194 nigel 93 #ifdef DEBUG
2195     if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2196     #endif
2197    
2198 nigel 77 /* Set up the default and non-default settings for greediness */
2199    
2200     greedy_default = ((options & PCRE_UNGREEDY) != 0);
2201     greedy_non_default = greedy_default ^ 1;
2202    
2203     /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2204     matching encountered yet". It gets changed to REQ_NONE if we hit something that
2205     matches a non-fixed char first char; reqbyte just remains unset if we never
2206     find one.
2207    
2208     When we hit a repeat whose minimum is zero, we may have to adjust these values
2209     to take the zero repeat into account. This is implemented by setting them to
2210     zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2211     item types that can be repeated set these backoff variables appropriately. */
2212    
2213     firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2214    
2215     /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2216     according to the current setting of the caseless flag. REQ_CASELESS is a bit
2217     value > 255. It is added into the firstbyte or reqbyte variables to record the
2218     case status of the value. This is used only for ASCII characters. */
2219    
2220     req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2221    
2222     /* Switch on next character until the end of the branch */
2223    
2224     for (;; ptr++)
2225     {
2226     BOOL negate_class;
2227     BOOL possessive_quantifier;
2228     BOOL is_quantifier;
2229 nigel 93 BOOL is_recurse;
2230 ph10 180 BOOL reset_bracount;
2231 nigel 77 int class_charcount;
2232     int class_lastchar;
2233     int newoptions;
2234     int recno;
2235 ph10 172 int refsign;
2236 nigel 77 int skipbytes;
2237     int subreqbyte;
2238     int subfirstbyte;
2239 nigel 93 int terminator;
2240 nigel 77 int mclength;
2241     uschar mcbuffer[8];
2242    
2243 nigel 93 /* Get next byte in the pattern */
2244 nigel 77
2245     c = *ptr;
2246    
2247 nigel 93 /* If we are in the pre-compile phase, accumulate the length used for the
2248     previous cycle of this loop. */
2249    
2250     if (lengthptr != NULL)
2251     {
2252     #ifdef DEBUG
2253     if (code > cd->hwm) cd->hwm = code; /* High water info */
2254     #endif
2255     if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2256     {
2257     *errorcodeptr = ERR52;
2258     goto FAILED;
2259     }
2260    
2261     /* There is at least one situation where code goes backwards: this is the
2262     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2263     the class is simply eliminated. However, it is created first, so we have to
2264     allow memory for it. Therefore, don't ever reduce the length at this point.
2265     */
2266    
2267     if (code < last_code) code = last_code;
2268 ph10 202
2269     /* Paranoid check for integer overflow */
2270    
2271     if (OFLOW_MAX - *lengthptr < code - last_code)
2272     {
2273     *errorcodeptr = ERR20;
2274     goto FAILED;
2275     }
2276    
2277 nigel 93 *lengthptr += code - last_code;
2278     DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2279    
2280     /* If "previous" is set and it is not at the start of the work space, move
2281     it back to there, in order to avoid filling up the work space. Otherwise,
2282     if "previous" is NULL, reset the current code pointer to the start. */
2283    
2284     if (previous != NULL)
2285     {
2286     if (previous > orig_code)
2287     {
2288     memmove(orig_code, previous, code - previous);
2289     code -= previous - orig_code;
2290     previous = orig_code;
2291     }
2292     }
2293     else code = orig_code;
2294    
2295     /* Remember where this code item starts so we can pick up the length
2296     next time round. */
2297    
2298     last_code = code;
2299     }
2300    
2301     /* In the real compile phase, just check the workspace used by the forward
2302     reference list. */
2303    
2304     else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2305     {
2306     *errorcodeptr = ERR52;
2307     goto FAILED;
2308     }
2309    
2310 nigel 77 /* If in \Q...\E, check for the end; if not, we have a literal */
2311    
2312     if (inescq && c != 0)
2313     {
2314     if (c == '\\' && ptr[1] == 'E')
2315     {
2316     inescq = FALSE;
2317     ptr++;
2318     continue;
2319     }
2320     else
2321     {
2322     if (previous_callout != NULL)
2323     {
2324 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2325     complete_callout(previous_callout, ptr, cd);
2326 nigel 77 previous_callout = NULL;
2327     }
2328     if ((options & PCRE_AUTO_CALLOUT) != 0)
2329     {
2330     previous_callout = code;
2331     code = auto_callout(code, ptr, cd);
2332     }
2333     goto NORMAL_CHAR;
2334     }
2335     }
2336    
2337     /* Fill in length of a previous callout, except when the next thing is
2338     a quantifier. */
2339    
2340     is_quantifier = c == '*' || c == '+' || c == '?' ||
2341     (c == '{' && is_counted_repeat(ptr+1));
2342    
2343     if (!is_quantifier && previous_callout != NULL &&
2344     after_manual_callout-- <= 0)
2345     {
2346 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2347     complete_callout(previous_callout, ptr, cd);
2348 nigel 77 previous_callout = NULL;
2349     }
2350    
2351     /* In extended mode, skip white space and comments */
2352    
2353     if ((options & PCRE_EXTENDED) != 0)
2354     {
2355     if ((cd->ctypes[c] & ctype_space) != 0) continue;
2356     if (c == '#')
2357     {
2358 nigel 93 while (*(++ptr) != 0)
2359 nigel 91 {
2360 nigel 93 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2361 nigel 91 }
2362 nigel 93 if (*ptr != 0) continue;
2363    
2364 nigel 91 /* Else fall through to handle end of string */
2365     c = 0;
2366 nigel 77 }
2367     }
2368    
2369     /* No auto callout for quantifiers. */
2370    
2371     if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2372     {
2373     previous_callout = code;
2374     code = auto_callout(code, ptr, cd);
2375     }
2376    
2377     switch(c)
2378     {
2379 nigel 93 /* ===================================================================*/
2380     case 0: /* The branch terminates at string end */
2381     case '|': /* or | or ) */
2382 nigel 77 case ')':
2383     *firstbyteptr = firstbyte;
2384     *reqbyteptr = reqbyte;
2385     *codeptr = code;
2386     *ptrptr = ptr;
2387 nigel 93 if (lengthptr != NULL)
2388     {
2389 ph10 202 if (OFLOW_MAX - *lengthptr < code - last_code)
2390     {
2391     *errorcodeptr = ERR20;
2392     goto FAILED;
2393     }
2394 nigel 93 *lengthptr += code - last_code; /* To include callout length */
2395     DPRINTF((">> end branch\n"));
2396     }
2397 nigel 77 return TRUE;
2398    
2399 nigel 93
2400     /* ===================================================================*/
2401 nigel 77 /* Handle single-character metacharacters. In multiline mode, ^ disables
2402     the setting of any following char as a first character. */
2403    
2404     case '^':
2405     if ((options & PCRE_MULTILINE) != 0)
2406     {
2407     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2408     }
2409     previous = NULL;
2410     *code++ = OP_CIRC;
2411     break;
2412    
2413     case '$':
2414     previous = NULL;
2415     *code++ = OP_DOLL;
2416     break;
2417    
2418     /* There can never be a first char if '.' is first, whatever happens about
2419     repeats. The value of reqbyte doesn't change either. */
2420    
2421     case '.':
2422     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2423     zerofirstbyte = firstbyte;
2424     zeroreqbyte = reqbyte;
2425     previous = code;
2426     *code++ = OP_ANY;
2427     break;
2428    
2429 nigel 93
2430     /* ===================================================================*/
2431 nigel 87 /* Character classes. If the included characters are all < 256, we build a
2432     32-byte bitmap of the permitted characters, except in the special case
2433     where there is only one such character. For negated classes, we build the
2434     map as usual, then invert it at the end. However, we use a different opcode
2435     so that data characters > 255 can be handled correctly.
2436 nigel 77
2437     If the class contains characters outside the 0-255 range, a different
2438     opcode is compiled. It may optionally have a bit map for characters < 256,
2439     but those above are are explicitly listed afterwards. A flag byte tells
2440     whether the bitmap is present, and whether this is a negated class or not.
2441     */
2442    
2443     case '[':
2444     previous = code;
2445    
2446     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2447     they are encountered at the top level, so we'll do that too. */
2448    
2449     if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2450     check_posix_syntax(ptr, &tempptr, cd))
2451     {
2452     *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2453     goto FAILED;
2454     }
2455    
2456 ph10 205 /* If the first character is '^', set the negation flag and skip it. Also,
2457     if the first few characters (either before or after ^) are \Q\E or \E we
2458     skip them too. This makes for compatibility with Perl. */
2459    
2460     negate_class = FALSE;
2461     for (;;)
2462 nigel 77 {
2463     c = *(++ptr);
2464 ph10 205 if (c == '\\')
2465     {
2466     if (ptr[1] == 'E') ptr++;
2467     else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
2468     else break;
2469     }
2470     else if (!negate_class && c == '^')
2471     negate_class = TRUE;
2472     else break;
2473     }
2474 nigel 77
2475     /* Keep a count of chars with values < 256 so that we can optimize the case
2476 nigel 93 of just a single character (as long as it's < 256). However, For higher
2477     valued UTF-8 characters, we don't yet do any optimization. */
2478 nigel 77
2479     class_charcount = 0;
2480     class_lastchar = -1;
2481    
2482 nigel 93 /* Initialize the 32-char bit map to all zeros. We build the map in a
2483     temporary bit of memory, in case the class contains only 1 character (less
2484     than 256), because in that case the compiled code doesn't use the bit map.
2485     */
2486    
2487     memset(classbits, 0, 32 * sizeof(uschar));
2488    
2489 nigel 77 #ifdef SUPPORT_UTF8
2490     class_utf8 = FALSE; /* No chars >= 256 */
2491 nigel 93 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
2492 nigel 77 #endif
2493    
2494     /* Process characters until ] is reached. By writing this as a "do" it
2495 nigel 93 means that an initial ] is taken as a data character. At the start of the
2496     loop, c contains the first byte of the character. */
2497 nigel 77
2498 nigel 93 if (c != 0) do
2499 nigel 77 {
2500 nigel 93 const uschar *oldptr;
2501    
2502 nigel 77 #ifdef SUPPORT_UTF8
2503     if (utf8 && c > 127)
2504     { /* Braces are required because the */
2505     GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2506     }
2507     #endif
2508    
2509     /* Inside \Q...\E everything is literal except \E */
2510    
2511     if (inescq)
2512     {
2513 nigel 93 if (c == '\\' && ptr[1] == 'E') /* If we are at \E */
2514 nigel 77 {
2515 nigel 93 inescq = FALSE; /* Reset literal state */
2516     ptr++; /* Skip the 'E' */
2517     continue; /* Carry on with next */
2518 nigel 77 }
2519 nigel 93 goto CHECK_RANGE; /* Could be range if \E follows */
2520 nigel 77 }
2521    
2522     /* Handle POSIX class names. Perl allows a negation extension of the
2523     form [:^name:]. A square bracket that doesn't match the syntax is
2524     treated as a literal. We also recognize the POSIX constructions
2525     [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2526     5.6 and 5.8 do. */
2527    
2528     if (c == '[' &&
2529     (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2530     check_posix_syntax(ptr, &tempptr, cd))
2531     {
2532     BOOL local_negate = FALSE;
2533 nigel 87 int posix_class, taboffset, tabopt;
2534 nigel 77 register const uschar *cbits = cd->cbits;
2535 nigel 87 uschar pbits[32];
2536 nigel 77
2537     if (ptr[1] != ':')
2538     {
2539     *errorcodeptr = ERR31;
2540     goto FAILED;
2541     }
2542    
2543     ptr += 2;
2544     if (*ptr == '^')
2545     {
2546     local_negate = TRUE;
2547     ptr++;
2548     }
2549    
2550     posix_class = check_posix_name(ptr, tempptr - ptr);
2551     if (posix_class < 0)
2552     {
2553     *errorcodeptr = ERR30;
2554     goto FAILED;
2555     }
2556    
2557     /* If matching is caseless, upper and lower are converted to
2558     alpha. This relies on the fact that the class table starts with
2559     alpha, lower, upper as the first 3 entries. */
2560    
2561     if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2562     posix_class = 0;
2563    
2564 nigel 87 /* We build the bit map for the POSIX class in a chunk of local store
2565     because we may be adding and subtracting from it, and we don't want to
2566     subtract bits that may be in the main map already. At the end we or the
2567     result into the bit map that is being built. */
2568 nigel 77
2569     posix_class *= 3;
2570 nigel 87
2571     /* Copy in the first table (always present) */
2572    
2573     memcpy(pbits, cbits + posix_class_maps[posix_class],
2574     32 * sizeof(uschar));
2575    
2576     /* If there is a second table, add or remove it as required. */
2577    
2578     taboffset = posix_class_maps[posix_class + 1];
2579     tabopt = posix_class_maps[posix_class + 2];
2580    
2581     if (taboffset >= 0)
2582 nigel 77 {
2583 nigel 87 if (tabopt >= 0)
2584     for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
2585 nigel 77 else
2586 nigel 87 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
2587 nigel 77 }
2588    
2589 nigel 87 /* Not see if we need to remove any special characters. An option
2590     value of 1 removes vertical space and 2 removes underscore. */
2591    
2592     if (tabopt < 0) tabopt = -tabopt;
2593     if (tabopt == 1) pbits[1] &= ~0x3c;
2594     else if (tabopt == 2) pbits[11] &= 0x7f;
2595    
2596     /* Add the POSIX table or its complement into the main table that is
2597     being built and we are done. */
2598    
2599     if (local_negate)
2600     for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2601     else
2602     for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2603    
2604 nigel 77 ptr = tempptr + 1;
2605     class_charcount = 10; /* Set > 1; assumes more than 1 per class */
2606     continue; /* End of POSIX syntax handling */
2607     }
2608    
2609     /* Backslash may introduce a single character, or it may introduce one
2610 nigel 93 of the specials, which just set a flag. The sequence \b is a special
2611     case. Inside a class (and only there) it is treated as backspace.
2612     Elsewhere it marks a word boundary. Other escapes have preset maps ready
2613 ph10 205 to 'or' into the one we are building. We assume they have more than one
2614 nigel 77 character in them, so set class_charcount bigger than one. */
2615    
2616     if (c == '\\')
2617     {
2618 nigel 93 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2619     if (*errorcodeptr != 0) goto FAILED;
2620 nigel 77
2621     if (-c == ESC_b) c = '\b'; /* \b is backslash in a class */
2622     else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
2623 nigel 93 else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */
2624 nigel 77 else if (-c == ESC_Q) /* Handle start of quoted string */
2625     {
2626     if (ptr[1] == '\\' && ptr[2] == 'E')
2627     {
2628     ptr += 2; /* avoid empty string */
2629     }
2630     else inescq = TRUE;
2631     continue;
2632     }
2633    
2634     if (c < 0)
2635     {
2636     register const uschar *cbits = cd->cbits;
2637     class_charcount += 2; /* Greater than 1 is what matters */
2638 nigel 93
2639     /* Save time by not doing this in the pre-compile phase. */
2640    
2641     if (lengthptr == NULL) switch (-c)
2642 nigel 77 {
2643     case ESC_d:
2644     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2645     continue;
2646    
2647     case ESC_D:
2648     for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2649     continue;
2650    
2651     case ESC_w:
2652     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2653     continue;
2654    
2655     case ESC_W:
2656     for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2657     continue;
2658    
2659     case ESC_s:
2660     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2661     classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2662     continue;
2663    
2664     case ESC_S:
2665     for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2666     classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2667     continue;
2668    
2669 nigel 93 case ESC_E: /* Perl ignores an orphan \E */
2670     continue;
2671 ph10 180
2672 nigel 93 default: /* Not recognized; fall through */
2673     break; /* Need "default" setting to stop compiler warning. */
2674     }
2675    
2676     /* In the pre-compile phase, just do the recognition. */
2677    
2678     else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2679     c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2680 ph10 180
2681 ph10 178 /* We need to deal with \H, \h, \V, and \v in both phases because
2682     they use extra memory. */
2683 ph10 180
2684 ph10 178 if (-c == ESC_h)
2685     {
2686     SETBIT(classbits, 0x09); /* VT */
2687     SETBIT(classbits, 0x20); /* SPACE */
2688 ph10 180 SETBIT(classbits, 0xa0); /* NSBP */
2689 ph10 178 #ifdef SUPPORT_UTF8
2690     if (utf8)
2691 ph10 180 {
2692 ph10 178 class_utf8 = TRUE;
2693     *class_utf8data++ = XCL_SINGLE;
2694 ph10 180 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2695 ph10 178 *class_utf8data++ = XCL_SINGLE;
2696 ph10 180 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2697     *class_utf8data++ = XCL_RANGE;
2698     class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2699     class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2700 ph10 178 *class_utf8data++ = XCL_SINGLE;
2701 ph10 180 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2702 ph10 178 *class_utf8data++ = XCL_SINGLE;
2703 ph10 180 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2704 ph10 178 *class_utf8data++ = XCL_SINGLE;
2705 ph10 180 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
2706     }
2707     #endif
2708     continue;
2709     }
2710 nigel 93
2711 ph10 178 if (-c == ESC_H)
2712     {
2713     for (c = 0; c < 32; c++)
2714     {
2715     int x = 0xff;
2716     switch (c)
2717 ph10 180 {
2718 ph10 178 case 0x09/8: x ^= 1 << (0x09%8); break;
2719     case 0x20/8: x ^= 1 << (0x20%8); break;
2720     case 0xa0/8: x ^= 1 << (0xa0%8); break;
2721     default: break;
2722     }
2723     classbits[c] |= x;
2724 ph10 180 }
2725    
2726 ph10 178 #ifdef SUPPORT_UTF8
2727     if (utf8)
2728 ph10 180 {
2729 ph10 178 class_utf8 = TRUE;
2730 ph10 180 *class_utf8data++ = XCL_RANGE;
2731     class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2732     class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
2733     *class_utf8data++ = XCL_RANGE;
2734     class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
2735     class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
2736     *class_utf8data++ = XCL_RANGE;
2737     class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
2738     class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
2739     *class_utf8data++ = XCL_RANGE;
2740     class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
2741     class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
2742     *class_utf8data++ = XCL_RANGE;
2743     class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
2744     class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
2745     *class_utf8data++ = XCL_RANGE;
2746     class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
2747     class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
2748     *class_utf8data++ = XCL_RANGE;
2749     class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
2750     class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2751     }
2752     #endif
2753     continue;
2754     }
2755 ph10 178
2756     if (-c == ESC_v)
2757     {
2758     SETBIT(classbits, 0x0a); /* LF */
2759     SETBIT(classbits, 0x0b); /* VT */
2760 ph10 180 SETBIT(classbits, 0x0c); /* FF */
2761     SETBIT(classbits, 0x0d); /* CR */
2762     SETBIT(classbits, 0x85); /* NEL */
2763 ph10 178 #ifdef SUPPORT_UTF8
2764     if (utf8)
2765 ph10 180 {
2766 ph10 178 class_utf8 = TRUE;
2767 ph10 180 *class_utf8data++ = XCL_RANGE;
2768     class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
2769     class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2770     }
2771     #endif
2772     continue;
2773     }
2774 ph10 178
2775     if (-c == ESC_V)
2776     {
2777     for (c = 0; c < 32; c++)
2778     {
2779     int x = 0xff;
2780     switch (c)
2781 ph10 180 {
2782 ph10 178 case 0x0a/8: x ^= 1 << (0x0a%8);
2783     x ^= 1 << (0x0b%8);
2784     x ^= 1 << (0x0c%8);
2785 ph10 180 x ^= 1 << (0x0d%8);
2786 ph10 178 break;
2787     case 0x85/8: x ^= 1 << (0x85%8); break;
2788     default: break;
2789     }
2790     classbits[c] |= x;
2791 ph10 180 }
2792    
2793 ph10 178 #ifdef SUPPORT_UTF8
2794     if (utf8)
2795 ph10 180 {
2796 ph10 178 class_utf8 = TRUE;
2797 ph10 180 *class_utf8data++ = XCL_RANGE;
2798     class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2799     class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
2800     *class_utf8data++ = XCL_RANGE;
2801     class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2802     class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2803     }
2804     #endif
2805     continue;
2806     }
2807 ph10 178
2808 nigel 93 /* We need to deal with \P and \p in both phases. */
2809    
2810 nigel 77 #ifdef SUPPORT_UCP
2811 nigel 93 if (-c == ESC_p || -c == ESC_P)
2812     {
2813     BOOL negated;
2814     int pdata;
2815     int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2816     if (ptype < 0) goto FAILED;
2817     class_utf8 = TRUE;
2818     *class_utf8data++ = ((-c == ESC_p) != negated)?
2819     XCL_PROP : XCL_NOTPROP;
2820     *class_utf8data++ = ptype;
2821     *class_utf8data++ = pdata;
2822     class_charcount -= 2; /* Not a < 256 character */
2823 nigel 77 continue;
2824 nigel 93 }
2825 nigel 77 #endif
2826 nigel 93 /* Unrecognized escapes are faulted if PCRE is running in its
2827     strict mode. By default, for compatibility with Perl, they are
2828     treated as literals. */
2829 nigel 77
2830 nigel 93 if ((options & PCRE_EXTRA) != 0)
2831     {
2832     *errorcodeptr = ERR7;
2833     goto FAILED;
2834     }
2835 nigel 77
2836 nigel 93 class_charcount -= 2; /* Undo the default count from above */
2837     c = *ptr; /* Get the final character and fall through */
2838 nigel 77 }
2839    
2840     /* Fall through if we have a single character (c >= 0). This may be
2841 nigel 93 greater than 256 in UTF-8 mode. */
2842 nigel 77
2843     } /* End of backslash handling */
2844    
2845     /* A single character may be followed by '-' to form a range. However,
2846     Perl does not permit ']' to be the end of the range. A '-' character
2847 nigel 93 at the end is treated as a literal. Perl ignores orphaned \E sequences
2848     entirely. The code for handling \Q and \E is messy. */
2849 nigel 77
2850 nigel 93 CHECK_RANGE:
2851     while (ptr[1] == '\\' && ptr[2] == 'E')
2852 nigel 77 {
2853 nigel 93 inescq = FALSE;
2854     ptr += 2;
2855     }
2856    
2857     oldptr = ptr;
2858    
2859     if (!inescq && ptr[1] == '-')
2860     {
2861 nigel 77 int d;
2862     ptr += 2;
2863 nigel 93 while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
2864 nigel 77
2865 nigel 93 /* If we hit \Q (not followed by \E) at this point, go into escaped
2866     mode. */
2867    
2868     while (*ptr == '\\' && ptr[1] == 'Q')
2869     {
2870     ptr += 2;
2871     if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
2872     inescq = TRUE;
2873     break;
2874     }
2875    
2876     if (*ptr == 0 || (!inescq && *ptr == ']'))
2877     {
2878     ptr = oldptr;
2879     goto LONE_SINGLE_CHARACTER;
2880     }
2881    
2882 nigel 77 #ifdef SUPPORT_UTF8
2883     if (utf8)
2884     { /* Braces are required because the */
2885     GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
2886     }
2887     else
2888     #endif
2889     d = *ptr; /* Not UTF-8 mode */
2890    
2891     /* The second part of a range can be a single-character escape, but
2892     not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2893     in such circumstances. */
2894    
2895 nigel 93 if (!inescq && d == '\\')
2896 nigel 77 {
2897 nigel 93 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2898     if (*errorcodeptr != 0) goto FAILED;
2899 nigel 77
2900 nigel 93 /* \b is backspace; \X is literal X; \R is literal R; any other
2901     special means the '-' was literal */
2902 nigel 77
2903     if (d < 0)
2904     {
2905     if (d == -ESC_b) d = '\b';
2906 nigel 93 else if (d == -ESC_X) d = 'X';
2907     else if (d == -ESC_R) d = 'R'; else
2908 nigel 77 {
2909 nigel 93 ptr = oldptr;
2910 nigel 77 goto LONE_SINGLE_CHARACTER; /* A few lines below */
2911     }
2912     }
2913     }
2914    
2915 nigel 93 /* Check that the two values are in the correct order. Optimize
2916     one-character ranges */
2917 nigel 77
2918 nigel 93 if (d < c)
2919     {
2920     *errorcodeptr = ERR8;
2921     goto FAILED;
2922     }
2923    
2924 nigel 77 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
2925    
2926     /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
2927     matching, we have to use an XCLASS with extra data items. Caseless
2928     matching for characters > 127 is available only if UCP support is
2929     available. */
2930    
2931     #ifdef SUPPORT_UTF8
2932     if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
2933     {
2934     class_utf8 = TRUE;
2935    
2936     /* With UCP support, we can find the other case equivalents of
2937     the relevant characters. There may be several ranges. Optimize how
2938     they fit with the basic range. */
2939    
2940     #ifdef SUPPORT_UCP
2941     if ((options & PCRE_CASELESS) != 0)
2942     {
2943 nigel 93 unsigned int occ, ocd;
2944     unsigned int cc = c;
2945     unsigned int origd = d;
2946 nigel 77 while (get_othercase_range(&cc, origd, &occ, &ocd))
2947     {
2948 ph10 180 if (occ >= (unsigned int)c &&
2949     ocd <= (unsigned int)d)
2950 ph10 176 continue; /* Skip embedded ranges */
2951 nigel 77
2952 ph10 180 if (occ < (unsigned int)c &&
2953 ph10 176 ocd >= (unsigned int)c - 1) /* Extend the basic range */
2954 nigel 77 { /* if there is overlap, */
2955     c = occ; /* noting that if occ < c */
2956     continue; /* we can't have ocd > d */
2957     } /* because a subrange is */
2958 ph10 180 if (ocd > (unsigned int)d &&
2959 ph10 176 occ <= (unsigned int)d + 1) /* always shorter than */
2960 nigel 77 { /* the basic range. */
2961     d = ocd;
2962     continue;
2963     }
2964    
2965     if (occ == ocd)
2966     {
2967     *class_utf8data++ = XCL_SINGLE;
2968     }
2969     else
2970     {
2971     *class_utf8data++ = XCL_RANGE;
2972     class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
2973     }
2974     class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
2975     }
2976     }
2977     #endif /* SUPPORT_UCP */
2978    
2979     /* Now record the original range, possibly modified for UCP caseless
2980     overlapping ranges. */
2981    
2982     *class_utf8data++ = XCL_RANGE;
2983     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2984     class_utf8data += _pcre_ord2utf8(d, class_utf8data);
2985    
2986     /* With UCP support, we are done. Without UCP support, there is no
2987     caseless matching for UTF-8 characters > 127; we can use the bit map
2988     for the smaller ones. */
2989    
2990     #ifdef SUPPORT_UCP
2991     continue; /* With next character in the class */
2992     #else
2993     if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
2994    
2995     /* Adjust upper limit and fall through to set up the map */
2996    
2997     d = 127;
2998    
2999     #endif /* SUPPORT_UCP */
3000     }
3001     #endif /* SUPPORT_UTF8 */
3002    
3003     /* We use the bit map for all cases when not in UTF-8 mode; else
3004     ranges that lie entirely within 0-127 when there is UCP support; else
3005     for partial ranges without UCP support. */
3006    
3007 nigel 93 class_charcount += d - c + 1;
3008     class_lastchar = d;
3009    
3010     /* We can save a bit of time by skipping this in the pre-compile. */
3011    
3012     if (lengthptr == NULL) for (; c <= d; c++)
3013 nigel 77 {
3014     classbits[c/8] |= (1 << (c&7));
3015     if ((options & PCRE_CASELESS) != 0)
3016     {
3017     int uc = cd->fcc[c]; /* flip case */
3018     classbits[uc/8] |= (1 << (uc&7));
3019     }
3020     }
3021    
3022     continue; /* Go get the next char in the class */
3023     }
3024    
3025     /* Handle a lone single character - we can get here for a normal
3026     non-escape char, or after \ that introduces a single character or for an
3027     apparent range that isn't. */
3028    
3029     LONE_SINGLE_CHARACTER:
3030    
3031     /* Handle a character that cannot go in the bit map */
3032    
3033     #ifdef SUPPORT_UTF8
3034     if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3035     {
3036     class_utf8 = TRUE;
3037     *class_utf8data++ = XCL_SINGLE;
3038     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3039    
3040     #ifdef SUPPORT_UCP
3041     if ((options & PCRE_CASELESS) != 0)
3042     {
3043 nigel 93 unsigned int othercase;
3044     if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
3045 nigel 77 {
3046     *class_utf8data++ = XCL_SINGLE;
3047     class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3048     }
3049     }
3050     #endif /* SUPPORT_UCP */
3051    
3052     }
3053     else
3054     #endif /* SUPPORT_UTF8 */
3055    
3056     /* Handle a single-byte character */
3057     {
3058     classbits[c/8] |= (1 << (c&7));
3059     if ((options & PCRE_CASELESS) != 0)
3060     {
3061     c = cd->fcc[c]; /* flip case */
3062     classbits[c/8] |= (1 << (c&7));
3063     }
3064     class_charcount++;
3065     class_lastchar = c;
3066     }
3067     }
3068    
3069 nigel 93 /* Loop until ']' reached. This "while" is the end of the "do" above. */
3070 nigel 77
3071 nigel 93 while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
3072 nigel 77
3073 nigel 93 if (c == 0) /* Missing terminating ']' */
3074     {
3075     *errorcodeptr = ERR6;
3076     goto FAILED;
3077     }
3078 ph10 205
3079 nigel 77 /* If class_charcount is 1, we saw precisely one character whose value is
3080     less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
3081     can optimize the negative case only if there were no characters >= 128
3082     because OP_NOT and the related opcodes like OP_NOTSTAR operate on
3083     single-bytes only. This is an historical hangover. Maybe one day we can
3084     tidy these opcodes to handle multi-byte characters.
3085    
3086     The optimization throws away the bit map. We turn the item into a
3087     1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3088     that OP_NOT does not support multibyte characters. In the positive case, it
3089     can cause firstbyte to be set. Otherwise, there can be no first char if
3090     this item is first, whatever repeat count may follow. In the case of
3091     reqbyte, save the previous value for reinstating. */
3092    
3093     #ifdef SUPPORT_UTF8
3094     if (class_charcount == 1 &&
3095     (!utf8 ||
3096     (!class_utf8 && (!negate_class || class_lastchar < 128))))
3097    
3098     #else
3099     if (class_charcount == 1)
3100     #endif
3101     {
3102     zeroreqbyte = reqbyte;
3103    
3104     /* The OP_NOT opcode works on one-byte characters only. */
3105    
3106     if (negate_class)
3107     {
3108     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3109     zerofirstbyte = firstbyte;
3110     *code++ = OP_NOT;
3111     *code++ = class_lastchar;
3112     break;
3113     }
3114    
3115     /* For a single, positive character, get the value into mcbuffer, and
3116     then we can handle this with the normal one-character code. */
3117    
3118     #ifdef SUPPORT_UTF8
3119     if (utf8 && class_lastchar > 127)
3120     mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3121     else
3122     #endif
3123     {
3124     mcbuffer[0] = class_lastchar;
3125     mclength = 1;
3126     }
3127     goto ONE_CHAR;
3128     } /* End of 1-char optimization */
3129    
3130     /* The general case - not the one-char optimization. If this is the first
3131     thing in the branch, there can be no first char setting, whatever the
3132     repeat count. Any reqbyte setting must remain unchanged after any kind of
3133     repeat. */
3134    
3135     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3136     zerofirstbyte = firstbyte;
3137     zeroreqbyte = reqbyte;
3138    
3139     /* If there are characters with values > 255, we have to compile an
3140     extended class, with its own opcode. If there are no characters < 256,
3141 nigel 93 we can omit the bitmap in the actual compiled code. */
3142 nigel 77
3143     #ifdef SUPPORT_UTF8
3144     if (class_utf8)
3145     {
3146     *class_utf8data++ = XCL_END; /* Marks the end of extra data */
3147     *code++ = OP_XCLASS;
3148     code += LINK_SIZE;
3149     *code = negate_class? XCL_NOT : 0;
3150    
3151 nigel 93 /* If the map is required, move up the extra data to make room for it;
3152     otherwise just move the code pointer to the end of the extra data. */
3153 nigel 77
3154     if (class_charcount > 0)
3155     {
3156     *code++ |= XCL_MAP;
3157 nigel 93 memmove(code + 32, code, class_utf8data - code);
3158 nigel 77 memcpy(code, classbits, 32);
3159 nigel 93 code = class_utf8data + 32;
3160 nigel 77 }
3161 nigel 93 else code = class_utf8data;
3162 nigel 77
3163     /* Now fill in the complete length of the item */
3164    
3165     PUT(previous, 1, code - previous);
3166     break; /* End of class handling */
3167     }
3168     #endif
3169    
3170     /* If there are no characters > 255, negate the 32-byte map if necessary,
3171     and copy it into the code vector. If this is the first thing in the branch,
3172     there can be no first char setting, whatever the repeat count. Any reqbyte
3173     setting must remain unchanged after any kind of repeat. */
3174    
3175     if (negate_class)
3176     {
3177     *code++ = OP_NCLASS;
3178 nigel 93 if (lengthptr == NULL) /* Save time in the pre-compile phase */
3179     for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3180 nigel 77 }
3181     else
3182     {
3183     *code++ = OP_CLASS;
3184     memcpy(code, classbits, 32);
3185     }
3186     code += 32;
3187     break;
3188    
3189 nigel 93
3190     /* ===================================================================*/
3191 nigel 77 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3192     has been tested above. */
3193    
3194     case '{':
3195     if (!is_quantifier) goto NORMAL_CHAR;
3196     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3197     if (*errorcodeptr != 0) goto FAILED;
3198     goto REPEAT;
3199    
3200     case '*':
3201     repeat_min = 0;
3202     repeat_max = -1;
3203     goto REPEAT;
3204    
3205     case '+':
3206     repeat_min = 1;
3207     repeat_max = -1;
3208     goto REPEAT;
3209    
3210     case '?':
3211     repeat_min = 0;
3212     repeat_max = 1;
3213    
3214     REPEAT:
3215     if (previous == NULL)
3216     {
3217     *errorcodeptr = ERR9;
3218     goto FAILED;
3219     }
3220    
3221     if (repeat_min == 0)
3222     {
3223     firstbyte = zerofirstbyte; /* Adjust for zero repeat */
3224     reqbyte = zeroreqbyte; /* Ditto */
3225     }
3226    
3227     /* Remember whether this is a variable length repeat */
3228    
3229     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3230    
3231     op_type = 0; /* Default single-char op codes */
3232     possessive_quantifier = FALSE; /* Default not possessive quantifier */
3233    
3234     /* Save start of previous item, in case we have to move it up to make space
3235     for an inserted OP_ONCE for the additional '+' extension. */
3236    
3237     tempcode = previous;
3238    
3239     /* If the next character is '+', we have a possessive quantifier. This
3240     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3241     If the next character is '?' this is a minimizing repeat, by default,
3242     but if PCRE_UNGREEDY is set, it works the other way round. We change the
3243     repeat type to the non-default. */
3244    
3245     if (ptr[1] == '+')
3246     {
3247     repeat_type = 0; /* Force greedy */
3248     possessive_quantifier = TRUE;
3249     ptr++;
3250     }
3251     else if (ptr[1] == '?')
3252     {
3253     repeat_type = greedy_non_default;
3254     ptr++;
3255     }
3256     else repeat_type = greedy_default;
3257    
3258     /* If previous was a character match, abolish the item and generate a
3259     repeat item instead. If a char item has a minimum of more than one, ensure
3260     that it is set in reqbyte - it might not be if a sequence such as x{3} is
3261     the first thing in a branch because the x will have gone into firstbyte
3262     instead. */
3263    
3264     if (*previous == OP_CHAR || *previous == OP_CHARNC)
3265     {
3266     /* Deal with UTF-8 characters that take up more than one byte. It's
3267     easier to write this out separately than try to macrify it. Use c to
3268     hold the length of the character in bytes, plus 0x80 to flag that it's a
3269     length rather than a small character. */
3270    
3271     #ifdef SUPPORT_UTF8
3272     if (utf8 && (code[-1] & 0x80) != 0)
3273     {
3274     uschar *lastchar = code - 1;
3275     while((*lastchar & 0xc0) == 0x80) lastchar--;
3276     c = code - lastchar; /* Length of UTF-8 character */
3277     memcpy(utf8_char, lastchar, c); /* Save the char */
3278     c |= 0x80; /* Flag c as a length */
3279     }
3280     else
3281     #endif
3282    
3283     /* Handle the case of a single byte - either with no UTF8 support, or
3284     with UTF-8 disabled, or for a UTF-8 character < 128. */
3285    
3286     {
3287     c = code[-1];
3288     if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3289     }
3290    
3291 nigel 93 /* If the repetition is unlimited, it pays to see if the next thing on
3292     the line is something that cannot possibly match this character. If so,
3293     automatically possessifying this item gains some performance in the case
3294     where the match fails. */
3295    
3296     if (!possessive_quantifier &&
3297     repeat_max < 0 &&
3298     check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3299     options, cd))
3300     {
3301     repeat_type = 0; /* Force greedy */
3302     possessive_quantifier = TRUE;
3303     }
3304    
3305 nigel 77 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3306     }
3307    
3308     /* If previous was a single negated character ([^a] or similar), we use
3309     one of the special opcodes, replacing it. The code is shared with single-
3310     character repeats by setting op_type to add a suitable offset into
3311 nigel 93 repeat_type. We can also test for auto-possessification. OP_NOT is
3312     currently used only for single-byte chars. */
3313 nigel 77
3314     else if (*previous == OP_NOT)
3315     {
3316     op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3317     c = previous[1];
3318 nigel 93 if (!possessive_quantifier &&
3319     repeat_max < 0 &&
3320     check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3321     {
3322     repeat_type = 0; /* Force greedy */
3323     possessive_quantifier = TRUE;
3324     }
3325 nigel 77 goto OUTPUT_SINGLE_REPEAT;
3326     }
3327    
3328     /* If previous was a character type match (\d or similar), abolish it and
3329     create a suitable repeat item. The code is shared with single-character
3330     repeats by setting op_type to add a suitable offset into repeat_type. Note
3331     that the Unicode property types will be present only when SUPPORT_UCP is
3332     defined, but we don't wrap the little bits of code here because it just
3333     makes it horribly messy. */
3334    
3335     else if (*previous < OP_EODN)
3336     {
3337     uschar *oldcode;
3338 nigel 87 int prop_type, prop_value;
3339 nigel 77 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3340     c = *previous;
3341    
3342 nigel 93 if (!possessive_quantifier &&
3343     repeat_max < 0 &&
3344     check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3345     {
3346     repeat_type = 0; /* Force greedy */
3347     possessive_quantifier = TRUE;
3348     }
3349    
3350 nigel 77 OUTPUT_SINGLE_REPEAT:
3351 nigel 87 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3352     {
3353     prop_type = previous[1];
3354     prop_value = previous[2];
3355     }
3356     else prop_type = prop_value = -1;
3357 nigel 77
3358     oldcode = code;
3359     code = previous; /* Usually overwrite previous item */
3360    
3361     /* If the maximum is zero then the minimum must also be zero; Perl allows
3362     this case, so we do too - by simply omitting the item altogether. */
3363    
3364     if (repeat_max == 0) goto END_REPEAT;
3365    
3366     /* All real repeats make it impossible to handle partial matching (maybe
3367     one day we will be able to remove this restriction). */
3368    
3369     if (repeat_max != 1) cd->nopartial = TRUE;
3370    
3371     /* Combine the op_type with the repeat_type */
3372    
3373     repeat_type += op_type;
3374    
3375     /* A minimum of zero is handled either as the special case * or ?, or as
3376     an UPTO, with the maximum given. */
3377    
3378     if (repeat_min == 0)
3379     {
3380     if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3381     else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3382     else
3383     {
3384     *code++ = OP_UPTO + repeat_type;
3385     PUT2INC(code, 0, repeat_max);
3386     }
3387     }
3388    
3389     /* A repeat minimum of 1 is optimized into some special cases. If the
3390 nigel 93 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3391 nigel 77 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3392     one less than the maximum. */
3393    
3394     else if (repeat_min == 1)
3395     {
3396     if (repeat_max == -1)
3397     *code++ = OP_PLUS + repeat_type;
3398     else
3399     {
3400     code = oldcode; /* leave previous item in place */
3401     if (repeat_max == 1) goto END_REPEAT;
3402     *code++ = OP_UPTO + repeat_type;
3403     PUT2INC(code, 0, repeat_max - 1);
3404     }
3405     }
3406    
3407     /* The case {n,n} is just an EXACT, while the general case {n,m} is
3408     handled as an EXACT followed by an UPTO. */
3409    
3410     else
3411     {
3412     *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3413     PUT2INC(code, 0, repeat_min);
3414    
3415     /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3416     we have to insert the character for the previous code. For a repeated
3417 nigel 87 Unicode property match, there are two extra bytes that define the
3418 nigel 77 required property. In UTF-8 mode, long characters have their length in
3419     c, with the 0x80 bit as a flag. */
3420    
3421     if (repeat_max < 0)
3422     {
3423     #ifdef SUPPORT_UTF8
3424     if (utf8 && c >= 128)
3425     {
3426     memcpy(code, utf8_char, c & 7);
3427     code += c & 7;
3428     }
3429     else
3430     #endif
3431     {
3432     *code++ = c;
3433 nigel 87 if (prop_type >= 0)
3434     {
3435     *code++ = prop_type;
3436     *code++ = prop_value;
3437     }
3438 nigel 77 }
3439     *code++ = OP_STAR + repeat_type;
3440     }
3441    
3442     /* Else insert an UPTO if the max is greater than the min, again
3443 nigel 93 preceded by the character, for the previously inserted code. If the
3444     UPTO is just for 1 instance, we can use QUERY instead. */
3445 nigel 77
3446     else if (repeat_max != repeat_min)
3447     {
3448     #ifdef SUPPORT_UTF8
3449     if (utf8 && c >= 128)
3450     {
3451     memcpy(code, utf8_char, c & 7);
3452     code += c & 7;
3453     }
3454     else
3455     #endif
3456     *code++ = c;
3457 nigel 87 if (prop_type >= 0)
3458     {
3459     *code++ = prop_type;
3460     *code++ = prop_value;
3461     }
3462 nigel 77 repeat_max -= repeat_min;
3463 nigel 93
3464     if (repeat_max == 1)
3465     {
3466     *code++ = OP_QUERY + repeat_type;
3467     }
3468     else
3469     {
3470     *code++ = OP_UPTO + repeat_type;
3471     PUT2INC(code, 0, repeat_max);
3472     }
3473 nigel 77 }
3474     }
3475    
3476     /* The character or character type itself comes last in all cases. */
3477    
3478     #ifdef SUPPORT_UTF8
3479     if (utf8 && c >= 128)
3480     {
3481     memcpy(code, utf8_char, c & 7);
3482     code += c & 7;
3483     }
3484     else
3485     #endif
3486     *code++ = c;
3487    
3488 nigel 87 /* For a repeated Unicode property match, there are two extra bytes that
3489     define the required property. */
3490 nigel 77
3491     #ifdef SUPPORT_UCP
3492 nigel 87 if (prop_type >= 0)
3493     {
3494     *code++ = prop_type;
3495     *code++ = prop_value;
3496     }
3497 nigel 77 #endif
3498     }
3499    
3500     /* If previous was a character class or a back reference, we put the repeat
3501     stuff after it, but just skip the item if the repeat was {0,0}. */
3502    
3503     else if (*previous == OP_CLASS ||
3504     *previous == OP_NCLASS ||
3505     #ifdef SUPPORT_UTF8
3506     *previous == OP_XCLASS ||
3507     #endif
3508     *previous == OP_REF)
3509     {
3510     if (repeat_max == 0)
3511     {
3512     code = previous;
3513     goto END_REPEAT;
3514     }
3515    
3516     /* All real repeats make it impossible to handle partial matching (maybe
3517     one day we will be able to remove this restriction). */
3518    
3519     if (repeat_max != 1) cd->nopartial = TRUE;
3520    
3521     if (repeat_min == 0 && repeat_max == -1)
3522     *code++ = OP_CRSTAR + repeat_type;
3523     else if (repeat_min == 1 && repeat_max == -1)
3524     *code++ = OP_CRPLUS + repeat_type;
3525     else if (repeat_min == 0 && repeat_max == 1)
3526     *code++ = OP_CRQUERY + repeat_type;
3527     else
3528     {
3529     *code++ = OP_CRRANGE + repeat_type;
3530     PUT2INC(code, 0, repeat_min);
3531     if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
3532     PUT2INC(code, 0, repeat_max);
3533     }
3534     }
3535    
3536     /* If previous was a bracket group, we may have to replicate it in certain
3537     cases. */
3538    
3539 nigel 93 else if (*previous == OP_BRA || *previous == OP_CBRA ||
3540     *previous == OP_ONCE || *previous == OP_COND)
3541 nigel 77 {
3542     register int i;
3543     int ketoffset = 0;
3544     int len = code - previous;
3545     uschar *bralink = NULL;
3546    
3547 nigel 93 /* Repeating a DEFINE group is pointless */
3548    
3549     if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3550     {
3551     *errorcodeptr = ERR55;
3552     goto FAILED;
3553     }
3554    
3555 nigel 77 /* If the maximum repeat count is unlimited, find the end of the bracket
3556     by scanning through from the start, and compute the offset back to it
3557     from the current code pointer. There may be an OP_OPT setting following
3558     the final KET, so we can't find the end just by going back from the code
3559     pointer. */
3560    
3561     if (repeat_max == -1)
3562     {
3563     register uschar *ket = previous;
3564     do ket += GET(ket, 1); while (*ket != OP_KET);
3565     ketoffset = code - ket;
3566     }
3567    
3568     /* The case of a zero minimum is special because of the need to stick
3569     OP_BRAZERO in front of it, and because the group appears once in the
3570     data, whereas in other cases it appears the minimum number of times. For
3571     this reason, it is simplest to treat this case separately, as otherwise
3572     the code gets far too messy. There are several special subcases when the
3573     minimum is zero. */
3574    
3575     if (repeat_min == 0)
3576     {
3577     /* If the maximum is also zero, we just omit the group from the output
3578     altogether. */
3579    
3580     if (repeat_max == 0)
3581     {
3582     code = previous;
3583     goto END_REPEAT;
3584     }
3585    
3586     /* If the maximum is 1 or unlimited, we just have to stick in the
3587     BRAZERO and do no more at this point. However, we do need to adjust
3588     any OP_RECURSE calls inside the group that refer to the group itself or
3589 nigel 93 any internal or forward referenced group, because the offset is from
3590     the start of the whole regex. Temporarily terminate the pattern while
3591     doing this. */
3592 nigel 77
3593     if (repeat_max <= 1)
3594     {
3595     *code = OP_END;
3596 nigel 93 adjust_recurse(previous, 1, utf8, cd, save_hwm);
3597 nigel 77 memmove(previous+1, previous, len);
3598     code++;
3599     *previous++ = OP_BRAZERO + repeat_type;
3600     }
3601    
3602     /* If the maximum is greater than 1 and limited, we have to replicate
3603     in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3604     The first one has to be handled carefully because it's the original
3605     copy, which has to be moved up. The remainder can be handled by code
3606     that is common with the non-zero minimum case below. We have to
3607     adjust the value of repeat_max, since one less copy is required. Once
3608     again, we may have to adjust any OP_RECURSE calls inside the group. */
3609    
3610     else
3611     {
3612     int offset;
3613     *code = OP_END;
3614 nigel 93 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3615 nigel 77 memmove(previous + 2 + LINK_SIZE, previous, len);
3616     code += 2 + LINK_SIZE;
3617     *previous++ = OP_BRAZERO + repeat_type;
3618     *previous++ = OP_BRA;
3619    
3620     /* We chain together the bracket offset fields that have to be
3621     filled in later when the ends of the brackets are reached. */
3622    
3623     offset = (bralink == NULL)? 0 : previous - bralink;
3624     bralink = previous;
3625     PUTINC(previous, 0, offset);
3626     }
3627    
3628     repeat_max--;
3629     }
3630    
3631     /* If the minimum is greater than zero, replicate the group as many
3632     times as necessary, and adjust the maximum to the number of subsequent
3633     copies that we need. If we set a first char from the group, and didn't
3634 nigel 93 set a required char, copy the latter from the former. If there are any
3635     forward reference subroutine calls in the group, there will be entries on
3636     the workspace list; replicate these with an appropriate increment. */
3637 nigel 77
3638     else
3639     {
3640     if (repeat_min > 1)
3641     {
3642 nigel 93 /* In the pre-compile phase, we don't actually do the replication. We
3643 ph10 202 just adjust the length as if we had. Do some paranoid checks for
3644     potential integer overflow. */
3645 nigel 93
3646     if (lengthptr != NULL)
3647 ph10 202 {
3648     int delta = (repeat_min - 1)*length_prevgroup;
3649     if ((double)(repeat_min - 1)*(double)length_prevgroup >
3650     (double)INT_MAX ||
3651     OFLOW_MAX - *lengthptr < delta)
3652     {
3653     *errorcodeptr = ERR20;
3654     goto FAILED;
3655     }
3656     *lengthptr += delta;
3657     }
3658 nigel 93
3659     /* This is compiling for real */
3660    
3661     else
3662 nigel 77 {
3663 nigel 93 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3664     for (i = 1; i < repeat_min; i++)
3665     {
3666     uschar *hc;
3667     uschar *this_hwm = cd->hwm;
3668     memcpy(code, previous, len);
3669     for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3670     {
3671     PUT(cd->hwm, 0, GET(hc, 0) + len);
3672     cd->hwm += LINK_SIZE;
3673     }
3674     save_hwm = this_hwm;
3675     code += len;
3676     }
3677 nigel 77 }
3678     }
3679 nigel 93
3680 nigel 77 if (repeat_max > 0) repeat_max -= repeat_min;
3681     }
3682    
3683     /* This code is common to both the zero and non-zero minimum cases. If
3684     the maximum is limited, it replicates the group in a nested fashion,
3685     remembering the bracket starts on a stack. In the case of a zero minimum,
3686     the first one was set up above. In all cases the repeat_max now specifies
3687 nigel 93 the number of additional copies needed. Again, we must remember to
3688     replicate entries on the forward reference list. */
3689 nigel 77
3690     if (repeat_max >= 0)
3691     {
3692 nigel 93 /* In the pre-compile phase, we don't actually do the replication. We
3693     just adjust the length as if we had. For each repetition we must add 1
3694     to the length for BRAZERO and for all but the last repetition we must
3695 ph10 202 add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
3696     paranoid checks to avoid integer overflow. */
3697 nigel 93
3698     if (lengthptr != NULL && repeat_max > 0)
3699 ph10 202 {
3700     int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3701     2 - 2*LINK_SIZE; /* Last one doesn't nest */
3702     if ((double)repeat_max *
3703     (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
3704     > (double)INT_MAX ||
3705     OFLOW_MAX - *lengthptr < delta)
3706     {
3707     *errorcodeptr = ERR20;
3708     goto FAILED;
3709     }
3710     *lengthptr += delta;
3711     }
3712 nigel 93
3713     /* This is compiling for real */
3714    
3715     else for (i = repeat_max - 1; i >= 0; i--)
3716 nigel 77 {
3717 nigel 93 uschar *hc;
3718     uschar *this_hwm = cd->hwm;
3719    
3720 nigel 77 *code++ = OP_BRAZERO + repeat_type;
3721    
3722     /* All but the final copy start a new nesting, maintaining the
3723     chain of brackets outstanding. */
3724    
3725     if (i != 0)
3726     {
3727     int offset;
3728     *code++ = OP_BRA;
3729     offset = (bralink == NULL)? 0 : code - bralink;
3730     bralink = code;
3731     PUTINC(code, 0, offset);
3732     }
3733    
3734     memcpy(code, previous, len);
3735 nigel 93 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3736     {
3737     PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3738     cd->hwm += LINK_SIZE;
3739     }
3740     save_hwm = this_hwm;
3741 nigel 77 code += len;
3742     }
3743    
3744     /* Now chain through the pending brackets, and fill in their length
3745     fields (which are holding the chain links pro tem). */
3746    
3747     while (bralink != NULL)
3748     {
3749     int oldlinkoffset;
3750     int offset = code - bralink + 1;
3751     uschar *bra = code - offset;
3752     oldlinkoffset = GET(bra, 1);
3753     bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
3754     *code++ = OP_KET;
3755     PUTINC(code, 0, offset);
3756     PUT(bra, 1, offset);
3757     }
3758     }
3759    
3760     /* If the maximum is unlimited, set a repeater in the final copy. We
3761     can't just offset backwards from the current code point, because we
3762     don't know if there's been an options resetting after the ket. The
3763 nigel 93 correct offset was computed above.
3764 nigel 77
3765 nigel 93 Then, when we are doing the actual compile phase, check to see whether
3766     this group is a non-atomic one that could match an empty string. If so,
3767     convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
3768     that runtime checking can be done. [This check is also applied to
3769     atomic groups at runtime, but in a different way.] */
3770    
3771     else
3772     {
3773     uschar *ketcode = code - ketoffset;
3774     uschar *bracode = ketcode - GET(ketcode, 1);
3775     *ketcode = OP_KETRMAX + repeat_type;
3776     if (lengthptr == NULL && *bracode != OP_ONCE)
3777     {
3778     uschar *scode = bracode;
3779     do
3780     {
3781     if (could_be_empty_branch(scode, ketcode, utf8))
3782     {
3783     *bracode += OP_SBRA - OP_BRA;
3784     break;
3785     }
3786     scode += GET(scode, 1);
3787     }
3788     while (*scode == OP_ALT);
3789     }
3790     }
3791 nigel 77 }
3792    
3793     /* Else there's some kind of shambles */
3794    
3795     else
3796     {
3797     *errorcodeptr = ERR11;
3798     goto FAILED;
3799     }
3800    
3801 nigel 93 /* If the character following a repeat is '+', or if certain optimization
3802     tests above succeeded, possessive_quantifier is TRUE. For some of the
3803     simpler opcodes, there is a special alternative opcode for this. For
3804     anything else, we wrap the entire repeated item inside OP_ONCE brackets.
3805     The '+' notation is just syntactic sugar, taken from Sun's Java package,
3806     but the special opcodes can optimize it a bit. The repeated item starts at
3807     tempcode, not at previous, which might be the first part of a string whose
3808     (former) last char we repeated.
3809 nigel 77
3810 nigel 93 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
3811     an 'upto' may follow. We skip over an 'exact' item, and then test the
3812     length of what remains before proceeding. */
3813    
3814 nigel 77 if (possessive_quantifier)
3815     {
3816 nigel 93 int len;
3817     if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
3818     *tempcode == OP_NOTEXACT)
3819     tempcode += _pcre_OP_lengths[*tempcode];
3820     len = code - tempcode;
3821     if (len > 0) switch (*tempcode)
3822     {
3823     case OP_STAR: *tempcode = OP_POSSTAR; break;
3824     case OP_PLUS: *tempcode = OP_POSPLUS; break;
3825     case OP_QUERY: *tempcode = OP_POSQUERY; break;
3826     case OP_UPTO: *tempcode = OP_POSUPTO; break;
3827    
3828     case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
3829     case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
3830     case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
3831     case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
3832    
3833     case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
3834     case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
3835     case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
3836     case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
3837    
3838     default:
3839     memmove(tempcode + 1+LINK_SIZE, tempcode, len);
3840     code += 1 + LINK_SIZE;
3841     len += 1 + LINK_SIZE;
3842     tempcode[0] = OP_ONCE;
3843     *code++ = OP_KET;
3844     PUTINC(code, 0, len);
3845     PUT(tempcode, 1, len);
3846     break;
3847     }
3848 nigel 77 }
3849    
3850     /* In all cases we no longer have a previous item. We also set the
3851     "follows varying string" flag for subsequently encountered reqbytes if
3852     it isn't already set and we have just passed a varying length item. */
3853    
3854     END_REPEAT:
3855     previous = NULL;
3856     cd->req_varyopt |= reqvary;
3857     break;
3858    
3859    
3860 nigel 93 /* ===================================================================*/
3861     /* Start of nested parenthesized sub-expression, or comment or lookahead or
3862     lookbehind or option setting or condition or all the other extended
3863     parenthesis forms. First deal with the specials; all are introduced by ?,
3864     and the appearance of any of them means that this is not a capturing
3865     group. */
3866 nigel 77
3867     case '(':
3868     newoptions = options;
3869     skipbytes = 0;
3870 nigel 93 bravalue = OP_CBRA;
3871     save_hwm = cd->hwm;
3872 ph10 180 reset_bracount = FALSE;
3873 nigel 77
3874     if (*(++ptr) == '?')
3875     {
3876 nigel 93 int i, set, unset, namelen;
3877 nigel 77 int *optset;
3878 nigel 93 const uschar *name;
3879     uschar *slot;
3880 nigel 77
3881     switch (*(++ptr))
3882     {
3883     case '#': /* Comment; skip to ket */
3884     ptr++;
3885 nigel 93 while (*ptr != 0 && *ptr != ')') ptr++;
3886     if (*ptr == 0)
3887     {
3888     *errorcodeptr = ERR18;
3889     goto FAILED;
3890     }
3891 nigel 77 continue;
3892    
3893 nigel 93
3894     /* ------------------------------------------------------------ */
3895 ph10 175 case '|': /* Reset capture count for each branch */
3896     reset_bracount = TRUE;
3897 ph10 180 /* Fall through */
3898 ph10 175
3899     /* ------------------------------------------------------------ */
3900 nigel 93 case ':': /* Non-capturing bracket */
3901 nigel 77 bravalue = OP_BRA;
3902     ptr++;
3903     break;
3904    
3905 nigel 93
3906     /* ------------------------------------------------------------ */
3907 nigel 77 case '(':
3908     bravalue = OP_COND; /* Conditional group */
3909    
3910 nigel 93 /* A condition can be an assertion, a number (referring to a numbered
3911     group), a name (referring to a named group), or 'R', referring to
3912     recursion. R<digits> and R&name are also permitted for recursion tests.
3913 nigel 77
3914 nigel 93 There are several syntaxes for testing a named group: (?(name)) is used
3915     by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
3916    
3917     There are two unfortunate ambiguities, caused by history. (a) 'R' can
3918     be the recursive thing or the name 'R' (and similarly for 'R' followed
3919     by digits), and (b) a number could be a name that consists of digits.
3920     In both cases, we look for a name first; if not found, we try the other
3921     cases. */
3922    
3923     /* For conditions that are assertions, check the syntax, and then exit
3924     the switch. This will take control down to where bracketed groups,
3925     including assertions, are processed. */
3926    
3927     if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
3928     break;
3929    
3930     /* Most other conditions use OP_CREF (a couple change to OP_RREF
3931     below), and all need to skip 3 bytes at the start of the group. */
3932    
3933     code[1+LINK_SIZE] = OP_CREF;
3934     skipbytes = 3;
3935 ph10 172 refsign = -1;
3936 nigel 93
3937     /* Check for a test for recursion in a named group. */
3938    
3939     if (ptr[1] == 'R' && ptr[2] == '&')
3940 nigel 77 {
3941 nigel 93 terminator = -1;
3942     ptr += 2;
3943     code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
3944     }
3945 nigel 91
3946 nigel 93 /* Check for a test for a named group's having been set, using the Perl
3947     syntax (?(<name>) or (?('name') */
3948 nigel 91
3949 nigel 93 else if (ptr[1] == '<')
3950     {
3951     terminator = '>';
3952     ptr++;
3953     }
3954     else if (ptr[1] == '\'')
3955     {
3956     terminator = '\'';
3957     ptr++;
3958     }
3959 ph10 172 else
3960 ph10 167 {
3961     terminator = 0;
3962 ph10 172 if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
3963     }
3964 nigel 77
3965 nigel 93 /* We now expect to read a name; anything else is an error */
3966 nigel 77
3967 nigel 93 if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
3968     {
3969     ptr += 1; /* To get the right offset */
3970     *errorcodeptr = ERR28;
3971     goto FAILED;
3972     }
3973    
3974     /* Read the name, but also get it as a number if it's all digits */
3975    
3976     recno = 0;
3977     name = ++ptr;
3978     while ((cd->ctypes[*ptr] & ctype_word) != 0)
3979     {
3980     if (recno >= 0)
3981     recno = ((digitab[*ptr] & ctype_digit) != 0)?
3982     recno * 10 + *ptr - '0' : -1;
3983 nigel 91 ptr++;
3984 nigel 93 }
3985     namelen = ptr - name;
3986 nigel 91
3987 nigel 93 if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
3988     {
3989     ptr--; /* Error offset */
3990     *errorcodeptr = ERR26;
3991     goto FAILED;
3992     }
3993 nigel 91
3994 nigel 93 /* Do no further checking in the pre-compile phase. */
3995 nigel 91
3996 nigel 93 if (lengthptr != NULL) break;
3997 nigel 91
3998 nigel 93 /* In the real compile we do the work of looking for the actual
3999 ph10 167 reference. If the string started with "+" or "-" we require the rest to
4000     be digits, in which case recno will be set. */
4001 ph10 172
4002 ph10 167 if (refsign > 0)
4003     {
4004     if (recno <= 0)
4005     {
4006     *errorcodeptr = ERR58;
4007     goto FAILED;
4008 ph10 172 }
4009 ph10 167 if (refsign == '-')
4010     {
4011 ph10 172 recno = cd->bracount - recno + 1;
4012 ph10 167 if (recno <= 0)
4013     {
4014     *errorcodeptr = ERR15;
4015     goto FAILED;
4016 ph10 172 }
4017 ph10 167 }
4018 ph10 172 else recno += cd->bracount;
4019 ph10 167 PUT2(code, 2+LINK_SIZE, recno);
4020     break;
4021 ph10 172 }
4022 nigel 91
4023 ph10 167 /* Otherwise (did not start with "+" or "-"), start by looking for the
4024     name. */
4025 ph10 172
4026 nigel 93 slot = cd->name_table;
4027     for (i = 0; i < cd->names_found; i++)
4028     {
4029     if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4030     slot += cd->name_entry_size;
4031     }
4032 nigel 91
4033 nigel 93 /* Found a previous named subpattern */
4034 nigel 91
4035 nigel 93 if (i < cd->names_found)
4036     {
4037     recno = GET2(slot, 0);
4038     PUT2(code, 2+LINK_SIZE, recno);
4039     }
4040 nigel 91
4041 nigel 93 /* Search the pattern for a forward reference */
4042 nigel 91
4043 nigel 93 else if ((i = find_parens(ptr, cd->bracount, name, namelen,
4044     (options & PCRE_EXTENDED) != 0)) > 0)
4045     {
4046     PUT2(code, 2+LINK_SIZE, i);
4047     }
4048 nigel 91
4049 nigel 93 /* If terminator == 0 it means that the name followed directly after
4050     the opening parenthesis [e.g. (?(abc)...] and in this case there are
4051     some further alternatives to try. For the cases where terminator != 0
4052     [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
4053     now checked all the possibilities, so give an error. */
4054 nigel 91
4055 nigel 93 else if (terminator != 0)
4056     {
4057     *errorcodeptr = ERR15;
4058     goto FAILED;
4059     }
4060    
4061     /* Check for (?(R) for recursion. Allow digits after R to specify a
4062     specific group number. */
4063    
4064     else if (*name == 'R')
4065     {
4066     recno = 0;
4067     for (i = 1; i < namelen; i++)
4068 nigel 91 {
4069 nigel 93 if ((digitab[name[i]] & ctype_digit) == 0)
4070     {
4071     *errorcodeptr = ERR15;
4072     goto FAILED;
4073     }
4074     recno = recno * 10 + name[i] - '0';
4075 nigel 77 }
4076 nigel 93 if (recno == 0) recno = RREF_ANY;
4077     code[1+LINK_SIZE] = OP_RREF; /* Change test type */
4078     PUT2(code, 2+LINK_SIZE, recno);
4079 nigel 77 }
4080 nigel 91
4081 nigel 93 /* Similarly, check for the (?(DEFINE) "condition", which is always
4082     false. */
4083 nigel 91
4084 nigel 93 else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
4085     {
4086     code[1+LINK_SIZE] = OP_DEF;
4087     skipbytes = 1;
4088     }
4089    
4090     /* Check for the "name" actually being a subpattern number. */
4091    
4092     else if (recno > 0)
4093     {
4094     PUT2(code, 2+LINK_SIZE, recno);
4095     }
4096    
4097     /* Either an unidentified subpattern, or a reference to (?(0) */
4098    
4099     else
4100     {
4101     *errorcodeptr = (recno == 0)? ERR35: ERR15;
4102     goto FAILED;
4103     }
4104 nigel 77 break;
4105    
4106 nigel 93
4107     /* ------------------------------------------------------------ */
4108 nigel 77 case '=': /* Positive lookahead */
4109     bravalue = OP_ASSERT;
4110     ptr++;
4111