/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 111 - (hide annotations) (download)
Thu Mar 8 16:53:09 2007 UTC (7 years, 7 months ago) by ph10
File MIME type: text/plain
File size: 172554 byte(s)
Create the PrepareRelease script to process the documentation and create the 
.generic files for distribution, also to remove trailing spaces. Update a lot 
more of the build-time documentation. Arrange for PrepareRelease and its 
sub-scripts to be distributed.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 nigel 87 Copyright (c) 1997-2006 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains the external function pcre_compile(), along with
42     supporting internal functions that are not used by other modules. */
43    
44    
/* These three names are defined ahead of the #include of pcre_internal.h
below. NOTE(review): presumably generic macros in that header expand NLBLOCK,
PSSTART and PSEND to reach the local "cd" block and its start_pattern /
end_pattern fields - confirm against pcre_internal.h. */
45 nigel 93 #define NLBLOCK cd /* Block containing newline information */
46     #define PSSTART start_pattern /* Field containing processed string start */
47     #define PSEND end_pattern /* Field containing processed string end */
48    
49    
50 nigel 77 #include "pcre_internal.h"
51    
52    
53 nigel 85 /* When DEBUG is defined, we need the pcre_printint() function, which is also
54     used by pcretest. DEBUG is not defined when building a production library. */
55    
56     #ifdef DEBUG
57     #include "pcre_printint.src"
58     #endif
59    
60    
61 nigel 77 /*************************************************
62     * Code parameters and static tables *
63     *************************************************/
64    
65 nigel 93 /* This value specifies the size of stack workspace that is used during the
66     first pre-compile phase that determines how much memory is required. The regex
67     is partly compiled into this space, but the compiled parts are discarded as
68     soon as they can be, so that hopefully there will never be an overrun. The code
69     does, however, check for an overrun. The largest amount I've seen used is 218,
70     so this number is very generous.
71 nigel 77
72 nigel 93 The same workspace is used during the second, actual compile phase for
73     remembering forward references to groups so that they can be filled in at the
74     end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
75     is 4 there is plenty of room. */
76 nigel 77
/* NOTE(review): the unit is presumably bytes (uschar elements), given that the
text above sizes list entries in LINK_SIZE bytes - confirm where the workspace
array is declared. */
77 nigel 93 #define COMPILE_WORK_SIZE (4096)
78 nigel 77
79 nigel 93
80 nigel 77 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
81     are simple data values; negative values are for special things like \d and so
82     on. Zero means further processing is needed (for things like \x), or the escape
83     is invalid. */
84    
/* Lookup table used by check_escape() for the character that follows a
backslash. Exactly one of the two variants below is compiled.

ASCII variant: 75 entries covering '0' to 'z' inclusive; check_escape() only
consults it when '0' <= c <= 'z' and indexes it as escapes[c - '0'].

EBCDIC variant: 184 entries covering code points 0x48 to 0xFF; it is indexed as
escapes[c - 0x48], and the row labels give the code point of the first entry in
each row. */
85 ph10 97 #ifndef EBCDIC /* This is the "normal" table for ASCII systems */
86 nigel 77 static const short int escapes[] = {
87     0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
88     0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
89     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
90     0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
91 nigel 93 -ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
92 nigel 77 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
93     '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
94 nigel 93 0, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */
95 nigel 77 -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, 0, -ESC_w, /* p - w */
96     0, 0, -ESC_z /* x - z */
97     };
98    
99 ph10 97 #else /* This is the "abnormal" table for EBCDIC systems */
/* NOTE(review): the fourth entry of row 90 is a literal 'l', whereas the ASCII
table has 0 in the slot for 'l'. If that entry corresponds to lower-case l, a
backslash followed by l is returned as a literal here and never reaches the
switch in check_escape() that raises ERR37 - confirm whether this is intended. */
100 nigel 77 static const short int escapes[] = {
101     /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
102     /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
103     /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
104     /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
105     /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
106     /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
107     /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
108     /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
109     /* 88 */ 0, 0, 0, '{', 0, 0, 0, 0,
110 nigel 93 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
111 nigel 77 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
112     /* A0 */ 0, '~', -ESC_s, ESC_tee, 0, 0, -ESC_w, 0,
113     /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
114     /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
115     /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
116     /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
117     /* C8 */ 0, 0, 0, 0, 0, 0, 0, 0,
118     /* D0 */ '}', 0, 0, 0, 0, 0, 0, -ESC_P,
119 nigel 93 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
120 nigel 77 /* E0 */ '\\', 0, -ESC_S, 0, 0, 0, -ESC_W, -ESC_X,
121     /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
122     /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
123     /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
124     };
125     #endif
126    
127    
128     /* Tables of names of POSIX character classes and their lengths. The list is
129 nigel 87 terminated by a zero length entry. The first three must be alpha, lower, upper,
130 nigel 77 as this is assumed for handling case independence. */
131    
/* Two parallel tables: posix_name_lengths[i] is the length of posix_names[i].
posix_names holds the 14 class names only; the zero terminator mentioned above
lives in posix_name_lengths, which therefore has 15 entries. */
132     static const char *const posix_names[] = {
133     "alpha", "lower", "upper",
134     "alnum", "ascii", "blank", "cntrl", "digit", "graph",
135     "print", "punct", "space", "word", "xdigit" };
136    
137     static const uschar posix_name_lengths[] = {
138     5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
139    
140 nigel 87 /* Table of class bit maps for each POSIX class. Each class is formed from a
141     base map, with an optional addition or removal of another map. Then, for some
142     classes, there is some additional tweaking: for [:blank:] the vertical space
143     characters are removed, and for [:alpha:] and [:alnum:] the underscore
144     character is removed. The triples in the table consist of the base map offset,
145     second map offset or -1 if no second map, and a non-negative value for map
146     addition or a negative value for map subtraction (if there are two maps). The
147     absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
148     remove vertical space characters, 2 => remove underscore. */
149 nigel 77
/* One row of three ints per class (14 rows, 42 ints), in the same order as the
entries of posix_names, as the per-row labels show. For example, the alpha row
is "word map minus digit map, then remove underscore", and the ascii row is
"print map plus cntrl map, no tweak". NOTE(review): the cbit_xxx offsets are
not defined in this file - presumably they come from pcre_internal.h. */
150     static const int posix_class_maps[] = {
151 nigel 87 cbit_word, cbit_digit, -2, /* alpha */
152     cbit_lower, -1, 0, /* lower */
153     cbit_upper, -1, 0, /* upper */
154     cbit_word, -1, 2, /* alnum - word without underscore */
155     cbit_print, cbit_cntrl, 0, /* ascii */
156     cbit_space, -1, 1, /* blank - a GNU extension */
157     cbit_cntrl, -1, 0, /* cntrl */
158     cbit_digit, -1, 0, /* digit */
159     cbit_graph, -1, 0, /* graph */
160     cbit_print, -1, 0, /* print */
161     cbit_punct, -1, 0, /* punct */
162     cbit_space, -1, 0, /* space */
163     cbit_word, -1, 0, /* word - a Perl extension */
164     cbit_xdigit,-1, 0 /* xdigit */
165 nigel 77 };
166    
167    
/* Two-level stringification: XSTRING(s) lets its argument be macro-expanded
before STRING() applies the # operator, so XSTRING(MAX_NAME_SIZE) yields the
numeric value as a string literal rather than the text "MAX_NAME_SIZE". It is
used below to embed limits in the error messages. */
168 nigel 93 #define STRING(a) # a
169     #define XSTRING(s) STRING(s)
170    
171 nigel 77 /* The texts of compile-time error messages. These are "char *" because they
172 nigel 93 are passed to the outside world. Do not ever re-use any error number, because
173     they are documented. Always add a new error instead. Messages marked DEAD below
174     are no longer used. */
175 nigel 77
/* Message table indexed by error number: entry 0 is "no error" and the
"n" marker comments give the index of the entry that follows them. There are
58 entries (indices 0 to 57). NOTE(review): the ERRnn constants used in this
file are presumably defined elsewhere so that ERRnn selects entry nn - confirm
against pcre_internal.h before adding or reordering messages. */
176     static const char *error_texts[] = {
177     "no error",
178     "\\ at end of pattern",
179     "\\c at end of pattern",
180     "unrecognized character follows \\",
181     "numbers out of order in {} quantifier",
182     /* 5 */
183     "number too big in {} quantifier",
184     "missing terminating ] for character class",
185     "invalid escape sequence in character class",
186     "range out of order in character class",
187     "nothing to repeat",
188     /* 10 */
189 nigel 93 "operand of unlimited repeat could match the empty string", /** DEAD **/
190 nigel 77 "internal error: unexpected repeat",
191     "unrecognized character after (?",
192     "POSIX named classes are supported only within a class",
193     "missing )",
194     /* 15 */
195     "reference to non-existent subpattern",
196     "erroffset passed as NULL",
197     "unknown option bit(s) set",
198     "missing ) after comment",
199 nigel 93 "parentheses nested too deeply", /** DEAD **/
200 nigel 77 /* 20 */
201     "regular expression too large",
202     "failed to get memory",
203     "unmatched parentheses",
204     "internal error: code overflow",
205     "unrecognized character after (?<",
206     /* 25 */
207     "lookbehind assertion is not fixed length",
208 nigel 91 "malformed number or name after (?(",
209 nigel 77 "conditional group contains more than two branches",
210     "assertion expected after (?(",
211     "(?R or (?digits must be followed by )",
212     /* 30 */
213     "unknown POSIX class name",
214     "POSIX collating elements are not supported",
215     "this version of PCRE is not compiled with PCRE_UTF8 support",
216 nigel 93 "spare error", /** DEAD **/
217 nigel 77 "character value in \\x{...} sequence is too large",
218     /* 35 */
219     "invalid condition (?(0)",
220     "\\C not allowed in lookbehind assertion",
221     "PCRE does not support \\L, \\l, \\N, \\U, or \\u",
222     "number after (?C is > 255",
223     "closing ) for (?C expected",
224     /* 40 */
225     "recursive call could loop indefinitely",
226     "unrecognized character after (?P",
227 nigel 93 "syntax error in subpattern name (missing terminator)",
228 nigel 91 "two named subpatterns have the same name",
229 nigel 77 "invalid UTF-8 string",
230     /* 45 */
231     "support for \\P, \\p, and \\X has not been compiled",
232     "malformed \\P or \\p sequence",
233 nigel 91 "unknown property name after \\P or \\p",
234 nigel 93 "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)",
235     "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")",
236 nigel 91 /* 50 */
237     "repeated subpattern is too long",
238 nigel 93 "octal value is greater than \\377 (not in UTF-8 mode)",
239     "internal error: overran compiling workspace",
240     "internal error: previously-checked referenced subpattern not found",
241     "DEFINE group contains more than one branch",
242     /* 55 */
243     "repeating a DEFINE group is not allowed",
244     "inconsistent NEWLINE options",
245     "\\g is not followed by an (optionally braced) non-zero number"
246 nigel 77 };
247    
248    
249     /* Table to identify digits and hex digits. This is used when compiling
250     patterns. Note that the tables in chartables are dependent on the locale, and
251     may mark arbitrary characters as digits - but the PCRE compiling code expects
252     to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
253     a private table here. It costs 256 bytes, but it is a lot faster than doing
254     character value tests (at least in some simple cases I timed), and in some
255     applications one wants PCRE to compile efficiently as well as match
256     efficiently.
257    
258     For convenience, we use the same bit definitions as in chartables:
259    
260     0x04 decimal digit
261     0x08 hexadecimal digit
262    
263     Then we can use ctype_digit and ctype_xdigit in the code. */
264    
/* digitab has 256 entries and is indexed directly by an unsigned byte value
taken from the pattern (for example digitab[ptr[1]] or digitab[*p]). An entry
of 0x0c marks a decimal digit (decimal and hexadecimal bits both set), 0x08
marks a letter that is only a hexadecimal digit (a-f, A-F), and 0x00 marks
everything else. Exactly one of the two variants below is compiled. */
265 ph10 97 #ifndef EBCDIC /* This is the "normal" case, for ASCII systems */
266 nigel 77 static const unsigned char digitab[] =
267     {
268     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
269     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
270     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
271     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
272     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
273     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
274     0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
275     0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
276     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
277     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
278     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
279     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
280     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
281     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
282     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
283     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
284     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
285     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
286     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
287     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
288     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
289     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
290     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
291     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
292     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
293     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
294     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
295     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
296     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
297     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
298     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
299     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
300    
301 ph10 97 #else /* This is the "abnormal" case, for EBCDIC systems */
302 nigel 77 static const unsigned char digitab[] =
303     {
304     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
305     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
306     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
307     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
308     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
309     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
310     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
311     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
312     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
313     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
314     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
315 ph10 97 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
316 nigel 77 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
317     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
318     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
319     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
320     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
321     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
322     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
323     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
324     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
325     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
326     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
327     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
328     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
329     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
330     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
331     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
332     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
333     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
334     0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
335     0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
336    
/* EBCDIC-only table, also 256 entries indexed by byte value. check_escape()
tests (ebcdic_chartab[c] & 0x0E) to decide whether the character after a
backslash is alphameric. NOTE(review): the meaning of bits other than 0x04 and
0x08 presumably follows the ctype bits of the main character tables - confirm
against the chartables definitions. */
337     static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
338     0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
339     0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
340     0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
341     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
342     0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
343     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
344     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
345     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
346     0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
347     0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
348     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
349 ph10 97 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
350 nigel 77 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
351     0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
352     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
353     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
354     0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
355     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
356     0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
357     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
358     0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
359     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
360     0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
361     0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
362     0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
363     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
364     0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
365     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
366     0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
367     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
368     0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
369     0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
370     #endif
371    
372    
373     /* Forward declaration (prototype only) of compile_regex(), needed to allow mutual recursion. The parameters are unnamed here; see the definition of the function for their meaning. */
374    
375     static BOOL
376 nigel 93 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, int, int *,
377     int *, branch_chain *, compile_data *, int *);
378 nigel 77
379    
380    
381     /*************************************************
382     * Handle escapes *
383     *************************************************/
384    
385     /* This function is called when a \ has been encountered. It either returns a
386     positive value for a simple escape such as \n, or a negative value which
387 nigel 93 encodes one of the more complicated things such as \d. A backreference to group
388     n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
389     UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
390     ptr is pointing at the \. On exit, it is on the final character of the escape
391     sequence.
392 nigel 77
393     Arguments:
394     ptrptr points to the pattern position pointer
395     errorcodeptr points to the errorcode variable
396     bracount number of previous extracting brackets
397     options the options bits
398     isclass TRUE if inside a character class
399    
400     Returns: zero or positive => a data character
401     negative => a special escape sequence
402     on error, errorptr is set
403     */
404    
405     static int
406     check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
407     int options, BOOL isclass)
408     {
409 nigel 87 BOOL utf8 = (options & PCRE_UTF8) != 0;
410     const uschar *ptr = *ptrptr + 1;
411 nigel 77 int c, i;
412    
413 nigel 87 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
414     ptr--; /* Set pointer back to the last byte */
415    
416 nigel 77 /* If backslash is at the end of the pattern, it's an error. */
417    
418     if (c == 0) *errorcodeptr = ERR1;
419    
420     /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
421     a table. A non-zero result is something that can be returned immediately.
422     Otherwise further processing may be required. */
423    
424 ph10 97 #ifndef EBCDIC /* ASCII coding */
425 nigel 77 else if (c < '0' || c > 'z') {} /* Not alphameric */
426     else if ((i = escapes[c - '0']) != 0) c = i;
427    
428 ph10 97 #else /* EBCDIC coding */
429 nigel 77 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
430     else if ((i = escapes[c - 0x48]) != 0) c = i;
431     #endif
432    
433     /* Escapes that need further processing, or are illegal. */
434    
435     else
436     {
437     const uschar *oldptr;
438 nigel 93 BOOL braced, negated;
439    
440 nigel 77 switch (c)
441     {
442     /* A number of Perl escapes are not handled by PCRE. We give an explicit
443     error. */
444    
445     case 'l':
446     case 'L':
447     case 'N':
448     case 'u':
449     case 'U':
450     *errorcodeptr = ERR37;
451     break;
452    
453 nigel 93 /* \g must be followed by a number, either plain or braced. If positive, it
454     is an absolute backreference. If negative, it is a relative backreference.
455     This is a Perl 5.10 feature. */
456    
457     case 'g':
458     if (ptr[1] == '{')
459     {
460     braced = TRUE;
461     ptr++;
462     }
463     else braced = FALSE;
464    
465     if (ptr[1] == '-')
466     {
467     negated = TRUE;
468     ptr++;
469     }
470     else negated = FALSE;
471    
472     c = 0;
473     while ((digitab[ptr[1]] & ctype_digit) != 0)
474     c = c * 10 + *(++ptr) - '0';
475    
476     if (c == 0 || (braced && *(++ptr) != '}'))
477     {
478     *errorcodeptr = ERR57;
479     return 0;
480     }
481    
482     if (negated)
483     {
484     if (c > bracount)
485     {
486     *errorcodeptr = ERR15;
487     return 0;
488     }
489     c = bracount - (c - 1);
490     }
491    
492     c = -(ESC_REF + c);
493     break;
494    
495 nigel 77 /* The handling of escape sequences consisting of a string of digits
496     starting with one that is not zero is not straightforward. By experiment,
497     the way Perl works seems to be as follows:
498    
499     Outside a character class, the digits are read as a decimal number. If the
500     number is less than 10, or if there are that many previous extracting
501     left brackets, then it is a back reference. Otherwise, up to three octal
502     digits are read to form an escaped byte. Thus \123 is likely to be octal
503     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
504     value is greater than 377, the least significant 8 bits are taken. Inside a
505     character class, \ followed by a digit is always an octal number. */
506    
507     case '1': case '2': case '3': case '4': case '5':
508     case '6': case '7': case '8': case '9':
509    
510     if (!isclass)
511     {
512     oldptr = ptr;
513     c -= '0';
514     while ((digitab[ptr[1]] & ctype_digit) != 0)
515     c = c * 10 + *(++ptr) - '0';
516     if (c < 10 || c <= bracount)
517     {
518     c = -(ESC_REF + c);
519     break;
520     }
521     ptr = oldptr; /* Put the pointer back and fall through */
522     }
523    
524     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
525     generates a binary zero byte and treats the digit as a following literal.
526     Thus we have to pull back the pointer by one. */
527    
528     if ((c = *ptr) >= '8')
529     {
530     ptr--;
531     c = 0;
532     break;
533     }
534    
535     /* \0 always starts an octal number, but we may drop through to here with a
536 nigel 91 larger first octal digit. The original code used just to take the least
537     significant 8 bits of octal numbers (I think this is what early Perls used
538     to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
539     than 3 octal digits. */
540 nigel 77
541     case '0':
542     c -= '0';
543     while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
544     c = c * 8 + *(++ptr) - '0';
545 nigel 91 if (!utf8 && c > 255) *errorcodeptr = ERR51;
546 nigel 77 break;
547    
548 nigel 87 /* \x is complicated. \x{ddd} is a character number which can be greater
549     than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
550     treated as a data character. */
551 nigel 77
552     case 'x':
553 nigel 87 if (ptr[1] == '{')
554 nigel 77 {
555     const uschar *pt = ptr + 2;
556 nigel 87 int count = 0;
557    
558 nigel 77 c = 0;
559     while ((digitab[*pt] & ctype_xdigit) != 0)
560     {
561 nigel 87 register int cc = *pt++;
562     if (c == 0 && cc == '0') continue; /* Leading zeroes */
563 nigel 77 count++;
564 nigel 87
565 ph10 97 #ifndef EBCDIC /* ASCII coding */
566 nigel 77 if (cc >= 'a') cc -= 32; /* Convert to upper case */
567 nigel 87 c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
568 ph10 97 #else /* EBCDIC coding */
569 nigel 77 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
570 nigel 87 c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
571 nigel 77 #endif
572     }
573 nigel 87
574 nigel 77 if (*pt == '}')
575     {
576 nigel 87 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
577 nigel 77 ptr = pt;
578     break;
579     }
580 nigel 87
581 nigel 77 /* If the sequence of hex digits does not end with '}', then we don't
582     recognize this construct; fall through to the normal \x handling. */
583     }
584    
585 nigel 87 /* Read just a single-byte hex-defined char */
586 nigel 77
587     c = 0;
588     while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
589     {
590     int cc; /* Some compilers don't like ++ */
591     cc = *(++ptr); /* in initializers */
592 ph10 97 #ifndef EBCDIC /* ASCII coding */
593 nigel 77 if (cc >= 'a') cc -= 32; /* Convert to upper case */
594     c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
595 ph10 97 #else /* EBCDIC coding */
596 nigel 77 if (cc <= 'z') cc += 64; /* Convert to upper case */
597     c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
598     #endif
599     }
600     break;
601    
602 nigel 93 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
603     This coding is ASCII-specific, but then the whole concept of \cx is
604     ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
605 nigel 77
606     case 'c':
607     c = *(++ptr);
608     if (c == 0)
609     {
610     *errorcodeptr = ERR2;
611     return 0;
612     }
613    
614 ph10 97 #ifndef EBCDIC /* ASCII coding */
615 nigel 77 if (c >= 'a' && c <= 'z') c -= 32;
616     c ^= 0x40;
617 ph10 97 #else /* EBCDIC coding */
618 nigel 77 if (c >= 'a' && c <= 'z') c += 64;
619     c ^= 0xC0;
620     #endif
621     break;
622    
623     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
624     other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
625     for Perl compatibility, it is a literal. This code looks a bit odd, but
626     there used to be some cases other than the default, and there may be again
627     in future, so I haven't "optimized" it. */
628    
629     default:
630     if ((options & PCRE_EXTRA) != 0) switch(c)
631     {
632     default:
633     *errorcodeptr = ERR3;
634     break;
635     }
636     break;
637     }
638     }
639    
640     *ptrptr = ptr;
641     return c;
642     }
643    
644    
645    
#ifdef SUPPORT_UCP
/*************************************************
*               Handle \P and \p                 *
*************************************************/

/* This function is called after \P or \p has been encountered, provided that
PCRE is compiled with support for Unicode properties. On entry, ptrptr is
pointing at the P or p. On exit, it is pointing at the final character of the
escape sequence; on an error exit it is left at the point where scanning
stopped.

The property name is either a single character, or a name of up to 31
characters enclosed in {}, optionally preceded by ^ for negation. It is looked
up in the _pcre_utt table by binary chop.

Arguments:
  ptrptr         points to the pattern position pointer
  negptr         points to a boolean that is set TRUE for negation else FALSE
                   (it is always set, even on an error exit)
  dptr           points to an int that is set to the detailed property value
                   (the value field of the matching _pcre_utt entry)
  errorcodeptr   points to the error code variable

Returns:         the type field of the matching _pcre_utt entry, or
                 -1 for a malformed sequence (ERR46) or unknown name (ERR47)
*/

static int
get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
{
int c, i, bot, top;
const uschar *ptr = *ptrptr;
char name[32];

/* Set this before anything can fail, so that the caller never sees an
uninitialized value. */

*negptr = FALSE;

c = *(++ptr);
if (c == 0) goto ERROR_RETURN;

/* \P or \p can be followed by a name in {}, optionally preceded by ^ for
negation. */

if (c == '{')
  {
  if (ptr[1] == '^')
    {
    *negptr = TRUE;
    ptr++;
    }

  /* Copy the name, leaving room for the terminating zero. The cast keeps the
  comparison signed, as i is an int. */

  for (i = 0; i < (int)sizeof(name) - 1; i++)
    {
    c = *(++ptr);
    if (c == 0) goto ERROR_RETURN;
    if (c == '}') break;
    name[i] = (char)c;
    }
  if (c != '}') goto ERROR_RETURN;    /* Name too long for the buffer */
  name[i] = 0;
  }

/* Otherwise there is just one following character */

else
  {
  name[0] = (char)c;
  name[1] = 0;
  }

*ptrptr = ptr;

/* Search for a recognized property name using binary chop */

bot = 0;
top = _pcre_utt_size;

while (bot < top)
  {
  i = (bot + top) >> 1;
  c = strcmp(name, _pcre_utt[i].name);
  if (c == 0)
    {
    *dptr = _pcre_utt[i].value;
    return _pcre_utt[i].type;
    }
  if (c > 0) bot = i + 1; else top = i;
  }

/* The name is well-formed but unknown; *ptrptr has already been updated. */

*errorcodeptr = ERR47;
return -1;

ERROR_RETURN:
*errorcodeptr = ERR46;
*ptrptr = ptr;
return -1;
}
#endif
735    
736    
737    
738    
739     /*************************************************
740     * Check for counted repeat *
741     *************************************************/
742    
743     /* This function is called when a '{' is encountered in a place where it might
744     start a quantifier. It looks ahead to see if it really is a quantifier or not.
745     It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
746     where the ddds are digits.
747    
748     Arguments:
749     p pointer to the first char after '{'
750    
751     Returns: TRUE or FALSE
752     */
753    
754     static BOOL
755     is_counted_repeat(const uschar *p)
756     {
757     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
758     while ((digitab[*p] & ctype_digit) != 0) p++;
759     if (*p == '}') return TRUE;
760    
761     if (*p++ != ',') return FALSE;
762     if (*p == '}') return TRUE;
763    
764     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
765     while ((digitab[*p] & ctype_digit) != 0) p++;
766    
767     return (*p == '}');
768     }
769    
770    
771    
772     /*************************************************
773     * Read repeat counts *
774     *************************************************/
775    
776     /* Read an item of the form {n,m} and return the values. This is called only
777     after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
778     so the syntax is guaranteed to be correct, but we need to check the values.
779    
780     Arguments:
781     p pointer to first char after '{'
782     minp pointer to int for min
783     maxp pointer to int for max
784     returned as -1 if no max
785     errorcodeptr points to error code variable
786    
787     Returns: pointer to '}' on success;
788     current ptr on error, with errorcodeptr set non-zero
789     */
790    
791     static const uschar *
792     read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
793     {
794     int min = 0;
795     int max = -1;
796    
797 nigel 81 /* Read the minimum value and do a paranoid check: a negative value indicates
798     an integer overflow. */
799    
800 nigel 77 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
801 nigel 81 if (min < 0 || min > 65535)
802     {
803     *errorcodeptr = ERR5;
804     return p;
805     }
806 nigel 77
807 nigel 81 /* Read the maximum value if there is one, and again do a paranoid on its size.
808     Also, max must not be less than min. */
809    
810 nigel 77 if (*p == '}') max = min; else
811     {
812     if (*(++p) != '}')
813     {
814     max = 0;
815     while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
816 nigel 81 if (max < 0 || max > 65535)
817     {
818     *errorcodeptr = ERR5;
819     return p;
820     }
821 nigel 77 if (max < min)
822     {
823     *errorcodeptr = ERR4;
824     return p;
825     }
826     }
827     }
828    
829 nigel 81 /* Fill in the required variables, and pass back the pointer to the terminating
830     '}'. */
831 nigel 77
832 nigel 81 *minp = min;
833     *maxp = max;
834 nigel 77 return p;
835     }
836    
837    
838    
839     /*************************************************
840 nigel 93 * Find forward referenced subpattern *
841 nigel 91 *************************************************/
842    
843 nigel 93 /* This function scans along a pattern's text looking for capturing
844     subpatterns, and counting them. If it finds a named pattern that matches the
845     name it is given, it returns its number. Alternatively, if the name is NULL, it
846     returns when it reaches a given numbered subpattern. This is used for forward
847     references to subpatterns. We know that if (?P< is encountered, the name will
848     be terminated by '>' because that is checked in the first pass.
849 nigel 91
850     Arguments:
851 nigel 93 ptr current position in the pattern
852     count current count of capturing parens so far encountered
853     name name to seek, or NULL if seeking a numbered subpattern
854     lorn name length, or subpattern number if name is NULL
855     xmode TRUE if we are in /x mode
856 nigel 91
857     Returns: the number of the named subpattern, or -1 if not found
858     */
859    
860     static int
861 nigel 93 find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
862     BOOL xmode)
863 nigel 91 {
864     const uschar *thisname;
865 nigel 93
866 nigel 91 for (; *ptr != 0; ptr++)
867     {
868 nigel 93 int term;
869    
870     /* Skip over backslashed characters and also entire \Q...\E */
871    
872     if (*ptr == '\\')
873     {
874     if (*(++ptr) == 0) return -1;
875     if (*ptr == 'Q') for (;;)
876     {
877     while (*(++ptr) != 0 && *ptr != '\\');
878     if (*ptr == 0) return -1;
879     if (*(++ptr) == 'E') break;
880     }
881     continue;
882     }
883    
884     /* Skip over character classes */
885    
886     if (*ptr == '[')
887     {
888     while (*(++ptr) != ']')
889     {
890     if (*ptr == '\\')
891     {
892     if (*(++ptr) == 0) return -1;
893     if (*ptr == 'Q') for (;;)
894     {
895     while (*(++ptr) != 0 && *ptr != '\\');
896     if (*ptr == 0) return -1;
897     if (*(++ptr) == 'E') break;
898     }
899     continue;
900     }
901     }
902     continue;
903     }
904    
905     /* Skip comments in /x mode */
906    
907     if (xmode && *ptr == '#')
908     {
909     while (*(++ptr) != 0 && *ptr != '\n');
910     if (*ptr == 0) return -1;
911     continue;
912     }
913    
914     /* An opening parens must now be a real metacharacter */
915    
916 nigel 91 if (*ptr != '(') continue;
917 nigel 93 if (ptr[1] != '?')
918     {
919     count++;
920     if (name == NULL && count == lorn) return count;
921     continue;
922     }
923    
924     ptr += 2;
925     if (*ptr == 'P') ptr++; /* Allow optional P */
926    
927     /* We have to disambiguate (?<! and (?<= from (?<name> */
928    
929     if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
930     *ptr != '\'')
931     continue;
932    
933 nigel 91 count++;
934 nigel 93
935     if (name == NULL && count == lorn) return count;
936     term = *ptr++;
937     if (term == '<') term = '>';
938 nigel 91 thisname = ptr;
939 nigel 93 while (*ptr != term) ptr++;
940     if (name != NULL && lorn == ptr - thisname &&
941     strncmp((const char *)name, (const char *)thisname, lorn) == 0)
942 nigel 91 return count;
943     }
944 nigel 93
945 nigel 91 return -1;
946     }
947    
948    
949    
950     /*************************************************
951 nigel 77 * Find first significant op code *
952     *************************************************/
953    
954     /* This is called by several functions that scan a compiled expression looking
955     for a fixed first character, or an anchoring op code etc. It skips over things
956     that do not influence this. For some calls, a change of option is important.
957     For some calls, it makes sense to skip negative forward and all backward
958     assertions, and also the \b assertion; for others it does not.
959    
960     Arguments:
961     code pointer to the start of the group
962     options pointer to external options
963     optbit the option bit whose changing is significant, or
964     zero if none are
965     skipassert TRUE if certain assertions are to be skipped
966    
967     Returns: pointer to the first significant opcode
968     */
969    
970     static const uschar*
971     first_significant_code(const uschar *code, int *options, int optbit,
972     BOOL skipassert)
973     {
974     for (;;)
975     {
976     switch ((int)*code)
977     {
978     case OP_OPT:
979     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
980     *options = (int)code[1];
981     code += 2;
982     break;
983    
984     case OP_ASSERT_NOT:
985     case OP_ASSERTBACK:
986     case OP_ASSERTBACK_NOT:
987     if (!skipassert) return code;
988     do code += GET(code, 1); while (*code == OP_ALT);
989     code += _pcre_OP_lengths[*code];
990     break;
991    
992     case OP_WORD_BOUNDARY:
993     case OP_NOT_WORD_BOUNDARY:
994     if (!skipassert) return code;
995     /* Fall through */
996    
997     case OP_CALLOUT:
998     case OP_CREF:
999 nigel 93 case OP_RREF:
1000     case OP_DEF:
1001 nigel 77 code += _pcre_OP_lengths[*code];
1002     break;
1003    
1004     default:
1005     return code;
1006     }
1007     }
1008     /* Control never reaches here */
1009     }
1010    
1011    
1012    
1013    
1014     /*************************************************
1015     * Find the fixed length of a pattern *
1016     *************************************************/
1017    
1018     /* Scan a pattern and compute the fixed length of subject that will match it,
1019     if the length is fixed. This is needed for dealing with backward assertions.
1020     In UTF8 mode, the result is in characters rather than bytes.
1021    
1022     Arguments:
1023     code points to the start of the pattern (the bracket)
1024     options the compiling options
1025    
1026     Returns: the fixed length, or -1 if there is no fixed length,
1027     or -2 if \C was encountered
1028     */
1029    
1030     static int
1031     find_fixedlength(uschar *code, int options)
1032     {
1033     int length = -1;
1034    
1035     register int branchlength = 0;
1036     register uschar *cc = code + 1 + LINK_SIZE;
1037    
1038     /* Scan along the opcodes for this branch. If we get to the end of the
1039     branch, check the length against that of the other branches. */
1040    
1041     for (;;)
1042     {
1043     int d;
1044     register int op = *cc;
1045    
1046     switch (op)
1047     {
1048 nigel 93 case OP_CBRA:
1049 nigel 77 case OP_BRA:
1050     case OP_ONCE:
1051     case OP_COND:
1052 nigel 93 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1053 nigel 77 if (d < 0) return d;
1054     branchlength += d;
1055     do cc += GET(cc, 1); while (*cc == OP_ALT);
1056     cc += 1 + LINK_SIZE;
1057     break;
1058    
1059     /* Reached end of a branch; if it's a ket it is the end of a nested
1060     call. If it's ALT it is an alternation in a nested call. If it is
1061     END it's the end of the outer call. All can be handled by the same code. */
1062    
1063     case OP_ALT:
1064     case OP_KET:
1065     case OP_KETRMAX:
1066     case OP_KETRMIN:
1067     case OP_END:
1068     if (length < 0) length = branchlength;
1069     else if (length != branchlength) return -1;
1070     if (*cc != OP_ALT) return length;
1071     cc += 1 + LINK_SIZE;
1072     branchlength = 0;
1073     break;
1074    
1075     /* Skip over assertive subpatterns */
1076    
1077     case OP_ASSERT:
1078     case OP_ASSERT_NOT:
1079     case OP_ASSERTBACK:
1080     case OP_ASSERTBACK_NOT:
1081     do cc += GET(cc, 1); while (*cc == OP_ALT);
1082     /* Fall through */
1083    
1084     /* Skip over things that don't match chars */
1085    
1086     case OP_REVERSE:
1087     case OP_CREF:
1088 nigel 93 case OP_RREF:
1089     case OP_DEF:
1090 nigel 77 case OP_OPT:
1091     case OP_CALLOUT:
1092     case OP_SOD:
1093     case OP_SOM:
1094     case OP_EOD:
1095     case OP_EODN:
1096     case OP_CIRC:
1097     case OP_DOLL:
1098     case OP_NOT_WORD_BOUNDARY:
1099     case OP_WORD_BOUNDARY:
1100     cc += _pcre_OP_lengths[*cc];
1101     break;
1102    
1103     /* Handle literal characters */
1104    
1105     case OP_CHAR:
1106     case OP_CHARNC:
1107 nigel 91 case OP_NOT:
1108 nigel 77 branchlength++;
1109     cc += 2;
1110     #ifdef SUPPORT_UTF8
1111     if ((options & PCRE_UTF8) != 0)
1112     {
1113     while ((*cc & 0xc0) == 0x80) cc++;
1114     }
1115     #endif
1116     break;
1117    
1118     /* Handle exact repetitions. The count is already in characters, but we
1119     need to skip over a multibyte character in UTF8 mode. */
1120    
1121     case OP_EXACT:
1122     branchlength += GET2(cc,1);
1123     cc += 4;
1124     #ifdef SUPPORT_UTF8
1125     if ((options & PCRE_UTF8) != 0)
1126     {
1127     while((*cc & 0x80) == 0x80) cc++;
1128     }
1129     #endif
1130     break;
1131    
1132     case OP_TYPEEXACT:
1133     branchlength += GET2(cc,1);
1134     cc += 4;
1135     break;
1136    
1137     /* Handle single-char matchers */
1138    
1139     case OP_PROP:
1140     case OP_NOTPROP:
1141 nigel 87 cc += 2;
1142 nigel 77 /* Fall through */
1143    
1144     case OP_NOT_DIGIT:
1145     case OP_DIGIT:
1146     case OP_NOT_WHITESPACE:
1147     case OP_WHITESPACE:
1148     case OP_NOT_WORDCHAR:
1149     case OP_WORDCHAR:
1150     case OP_ANY:
1151     branchlength++;
1152     cc++;
1153     break;
1154    
1155     /* The single-byte matcher isn't allowed */
1156    
1157     case OP_ANYBYTE:
1158     return -2;
1159    
1160     /* Check a class for variable quantification */
1161    
1162     #ifdef SUPPORT_UTF8
1163     case OP_XCLASS:
1164     cc += GET(cc, 1) - 33;
1165     /* Fall through */
1166     #endif
1167    
1168     case OP_CLASS:
1169     case OP_NCLASS:
1170     cc += 33;
1171    
1172     switch (*cc)
1173     {
1174     case OP_CRSTAR:
1175     case OP_CRMINSTAR:
1176     case OP_CRQUERY:
1177     case OP_CRMINQUERY:
1178     return -1;
1179    
1180     case OP_CRRANGE:
1181     case OP_CRMINRANGE:
1182     if (GET2(cc,1) != GET2(cc,3)) return -1;
1183     branchlength += GET2(cc,1);
1184     cc += 5;
1185     break;
1186    
1187     default:
1188     branchlength++;
1189     }
1190     break;
1191    
1192     /* Anything else is variable length */
1193    
1194     default:
1195     return -1;
1196     }
1197     }
1198     /* Control never gets here */
1199     }
1200    
1201    
1202    
1203    
1204     /*************************************************
1205     * Scan compiled regex for numbered bracket *
1206     *************************************************/
1207    
1208     /* This little function scans through a compiled pattern until it finds a
1209     capturing bracket with the given number.
1210    
1211     Arguments:
1212     code points to start of expression
1213     utf8 TRUE in UTF-8 mode
1214     number the required bracket number
1215    
1216     Returns: pointer to the opcode for the bracket, or NULL if not found
1217     */
1218    
1219     static const uschar *
1220     find_bracket(const uschar *code, BOOL utf8, int number)
1221     {
1222     for (;;)
1223     {
1224     register int c = *code;
1225     if (c == OP_END) return NULL;
1226 nigel 91
1227     /* XCLASS is used for classes that cannot be represented just by a bit
1228     map. This includes negated single high-valued characters. The length in
1229     the table is zero; the actual length is stored in the compiled code. */
1230    
1231     if (c == OP_XCLASS) code += GET(code, 1);
1232    
1233 nigel 93 /* Handle capturing bracket */
1234 nigel 91
1235 nigel 93 else if (c == OP_CBRA)
1236 nigel 77 {
1237 nigel 93 int n = GET2(code, 1+LINK_SIZE);
1238 nigel 77 if (n == number) return (uschar *)code;
1239 nigel 93 code += _pcre_OP_lengths[c];
1240 nigel 77 }
1241 nigel 91
1242 nigel 93 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1243     a multi-byte character. The length in the table is a minimum, so we have to
1244     arrange to skip the extra bytes. */
1245 nigel 91
1246 nigel 77 else
1247     {
1248     code += _pcre_OP_lengths[c];
1249 ph10 107 #ifdef SUPPORT_UTF8
1250 nigel 77 if (utf8) switch(c)
1251     {
1252     case OP_CHAR:
1253     case OP_CHARNC:
1254     case OP_EXACT:
1255     case OP_UPTO:
1256     case OP_MINUPTO:
1257 nigel 93 case OP_POSUPTO:
1258 nigel 77 case OP_STAR:
1259     case OP_MINSTAR:
1260 nigel 93 case OP_POSSTAR:
1261 nigel 77 case OP_PLUS:
1262     case OP_MINPLUS:
1263 nigel 93 case OP_POSPLUS:
1264 nigel 77 case OP_QUERY:
1265     case OP_MINQUERY:
1266 nigel 93 case OP_POSQUERY:
1267     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1268 nigel 77 break;
1269     }
1270 ph10 111 #endif
1271 nigel 77 }
1272     }
1273     }
1274    
1275    
1276    
1277     /*************************************************
1278     * Scan compiled regex for recursion reference *
1279     *************************************************/
1280    
1281     /* This little function scans through a compiled pattern until it finds an
1282     instance of OP_RECURSE.
1283    
1284     Arguments:
1285     code points to start of expression
1286     utf8 TRUE in UTF-8 mode
1287    
1288     Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1289     */
1290    
1291     static const uschar *
1292     find_recurse(const uschar *code, BOOL utf8)
1293     {
1294     for (;;)
1295     {
1296     register int c = *code;
1297     if (c == OP_END) return NULL;
1298 nigel 91 if (c == OP_RECURSE) return code;
1299    
1300     /* XCLASS is used for classes that cannot be represented just by a bit
1301     map. This includes negated single high-valued characters. The length in
1302     the table is zero; the actual length is stored in the compiled code. */
1303    
1304     if (c == OP_XCLASS) code += GET(code, 1);
1305    
1306     /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes
1307     that are followed by a character may be followed by a multi-byte character.
1308 nigel 93 The length in the table is a minimum, so we have to arrange to skip the extra
1309     bytes. */
1310 nigel 91
1311 nigel 77 else
1312     {
1313     code += _pcre_OP_lengths[c];
1314 ph10 107 #ifdef SUPPORT_UTF8
1315 nigel 77 if (utf8) switch(c)
1316     {
1317     case OP_CHAR:
1318     case OP_CHARNC:
1319     case OP_EXACT:
1320     case OP_UPTO:
1321     case OP_MINUPTO:
1322 nigel 93 case OP_POSUPTO:
1323 nigel 77 case OP_STAR:
1324     case OP_MINSTAR:
1325 nigel 93 case OP_POSSTAR:
1326 nigel 77 case OP_PLUS:
1327     case OP_MINPLUS:
1328 nigel 93 case OP_POSPLUS:
1329 nigel 77 case OP_QUERY:
1330     case OP_MINQUERY:
1331 nigel 93 case OP_POSQUERY:
1332     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1333 nigel 77 break;
1334     }
1335 ph10 111 #endif
1336 nigel 77 }
1337     }
1338     }
1339    
1340    
1341    
1342     /*************************************************
1343     * Scan compiled branch for non-emptiness *
1344     *************************************************/
1345    
1346     /* This function scans through a branch of a compiled pattern to see whether it
1347 nigel 93 can match the empty string or not. It is called from could_be_empty()
1348     below and from compile_branch() when checking for an unlimited repeat of a
1349     group that can match nothing. Note that first_significant_code() skips over
1350     assertions. If we hit an unclosed bracket, we return "empty" - this means we've
1351     struck an inner bracket whose current branch will already have been scanned.
1352 nigel 77
1353     Arguments:
1354     code points to start of search
1355     endcode points to where to stop
1356     utf8 TRUE if in UTF8 mode
1357    
1358     Returns: TRUE if what is matched could be empty
1359     */
1360    
1361     static BOOL
1362     could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1363     {
1364     register int c;
1365 nigel 93 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1366 nigel 77 code < endcode;
1367     code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1368     {
1369     const uschar *ccode;
1370    
1371     c = *code;
1372    
1373 nigel 93 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE)
1374 nigel 77 {
1375     BOOL empty_branch;
1376     if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1377    
1378     /* Scan a closed bracket */
1379    
1380     empty_branch = FALSE;
1381     do
1382     {
1383     if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1384     empty_branch = TRUE;
1385     code += GET(code, 1);
1386     }
1387     while (*code == OP_ALT);
1388     if (!empty_branch) return FALSE; /* All branches are non-empty */
1389 nigel 93
1390     /* Move past the KET and fudge things so that the increment in the "for"
1391     above has no effect. */
1392    
1393     c = OP_END;
1394     code += 1 + LINK_SIZE - _pcre_OP_lengths[c];
1395     continue;
1396 nigel 77 }
1397    
1398 nigel 93 /* Handle the other opcodes */
1399    
1400     switch (c)
1401 nigel 77 {
1402     /* Check for quantifiers after a class */
1403    
1404     #ifdef SUPPORT_UTF8
1405     case OP_XCLASS:
1406     ccode = code + GET(code, 1);
1407     goto CHECK_CLASS_REPEAT;
1408     #endif
1409    
1410     case OP_CLASS:
1411     case OP_NCLASS:
1412     ccode = code + 33;
1413    
1414     #ifdef SUPPORT_UTF8
1415     CHECK_CLASS_REPEAT:
1416     #endif
1417    
1418     switch (*ccode)
1419     {
1420     case OP_CRSTAR: /* These could be empty; continue */
1421     case OP_CRMINSTAR:
1422     case OP_CRQUERY:
1423     case OP_CRMINQUERY:
1424     break;
1425    
1426     default: /* Non-repeat => class must match */
1427     case OP_CRPLUS: /* These repeats aren't empty */
1428     case OP_CRMINPLUS:
1429     return FALSE;
1430    
1431     case OP_CRRANGE:
1432     case OP_CRMINRANGE:
1433     if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1434     break;
1435     }
1436     break;
1437    
1438     /* Opcodes that must match a character */
1439    
1440     case OP_PROP:
1441     case OP_NOTPROP:
1442     case OP_EXTUNI:
1443     case OP_NOT_DIGIT:
1444     case OP_DIGIT:
1445     case OP_NOT_WHITESPACE:
1446     case OP_WHITESPACE:
1447     case OP_NOT_WORDCHAR:
1448     case OP_WORDCHAR:
1449     case OP_ANY:
1450     case OP_ANYBYTE:
1451     case OP_CHAR:
1452     case OP_CHARNC:
1453     case OP_NOT:
1454     case OP_PLUS:
1455     case OP_MINPLUS:
1456 nigel 93 case OP_POSPLUS:
1457 nigel 77 case OP_EXACT:
1458     case OP_NOTPLUS:
1459     case OP_NOTMINPLUS:
1460 nigel 93 case OP_NOTPOSPLUS:
1461 nigel 77 case OP_NOTEXACT:
1462     case OP_TYPEPLUS:
1463     case OP_TYPEMINPLUS:
1464 nigel 93 case OP_TYPEPOSPLUS:
1465 nigel 77 case OP_TYPEEXACT:
1466     return FALSE;
1467    
1468     /* End of branch */
1469    
1470     case OP_KET:
1471     case OP_KETRMAX:
1472     case OP_KETRMIN:
1473     case OP_ALT:
1474     return TRUE;
1475    
1476 nigel 93 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1477     MINUPTO, and POSUPTO may be followed by a multibyte character */
1478 nigel 77
1479     #ifdef SUPPORT_UTF8
1480     case OP_STAR:
1481     case OP_MINSTAR:
1482 nigel 93 case OP_POSSTAR:
1483 nigel 77 case OP_QUERY:
1484     case OP_MINQUERY:
1485 nigel 93 case OP_POSQUERY:
1486 nigel 77 case OP_UPTO:
1487     case OP_MINUPTO:
1488 nigel 93 case OP_POSUPTO:
1489 nigel 77 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1490     break;
1491     #endif
1492     }
1493     }
1494    
1495     return TRUE;
1496     }
1497    
1498    
1499    
1500     /*************************************************
1501     * Scan compiled regex for non-emptiness *
1502     *************************************************/
1503    
1504     /* This function is called to check for left recursive calls. We want to check
1505     the current branch of the current pattern to see if it could match the empty
1506     string. If it could, we must look outwards for branches at other levels,
1507     stopping when we pass beyond the bracket which is the subject of the recursion.
1508    
1509     Arguments:
1510     code points to start of the recursion
1511     endcode points to where to stop (current RECURSE item)
1512     bcptr points to the chain of current (unclosed) branch starts
1513     utf8 TRUE if in UTF-8 mode
1514    
1515     Returns: TRUE if what is matched could be empty
1516     */
1517    
1518     static BOOL
1519     could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1520     BOOL utf8)
1521     {
1522     while (bcptr != NULL && bcptr->current >= code)
1523     {
1524     if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1525     bcptr = bcptr->outer;
1526     }
1527     return TRUE;
1528     }
1529    
1530    
1531    
1532     /*************************************************
1533     * Check for POSIX class syntax *
1534     *************************************************/
1535    
1536     /* This function is called when the sequence "[:" or "[." or "[=" is
1537     encountered in a character class. It checks whether this is followed by an
1538     optional ^ and then a sequence of letters, terminated by a matching ":]" or
1539     ".]" or "=]".
1540    
1541     Argument:
1542     ptr pointer to the initial [
1543     endptr where to return the end pointer
1544     cd pointer to compile data
1545    
1546     Returns: TRUE or FALSE
1547     */
1548    
1549     static BOOL
1550     check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1551     {
1552     int terminator; /* Don't combine these lines; the Solaris cc */
1553     terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1554     if (*(++ptr) == '^') ptr++;
1555     while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1556     if (*ptr == terminator && ptr[1] == ']')
1557     {
1558     *endptr = ptr;
1559     return TRUE;
1560     }
1561     return FALSE;
1562     }
1563    
1564    
1565    
1566    
1567     /*************************************************
1568     * Check POSIX class name *
1569     *************************************************/
1570    
1571     /* This function is called to check the name given in a POSIX-style class entry
1572     such as [:alnum:].
1573    
1574     Arguments:
1575     ptr points to the first letter
1576     len the length of the name
1577    
1578     Returns: a value representing the name, or -1 if unknown
1579     */
1580    
1581     static int
1582     check_posix_name(const uschar *ptr, int len)
1583     {
1584     register int yield = 0;
1585     while (posix_name_lengths[yield] != 0)
1586     {
1587     if (len == posix_name_lengths[yield] &&
1588     strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1589     yield++;
1590     }
1591     return -1;
1592     }
1593    
1594    
1595     /*************************************************
1596     * Adjust OP_RECURSE items in repeated group *
1597     *************************************************/
1598    
1599     /* OP_RECURSE items contain an offset from the start of the regex to the group
1600     that is referenced. This means that groups can be replicated for fixed
1601     repetition simply by copying (because the recursion is allowed to refer to
1602     earlier groups that are outside the current group). However, when a group is
1603     optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1604     it, after it has been compiled. This means that any OP_RECURSE items within it
1605     that refer to the group itself or any contained groups have to have their
1606 nigel 93 offsets adjusted. That one of the jobs of this function. Before it is called,
1607     the partially compiled regex must be temporarily terminated with OP_END.
1608 nigel 77
1609 nigel 93 This function has been extended with the possibility of forward references for
1610     recursions and subroutine calls. It must also check the list of such references
1611     for the group we are dealing with. If it finds that one of the recursions in
1612     the current group is on this list, it adjusts the offset in the list, not the
1613     value in the reference (which is a group number).
1614    
1615 nigel 77 Arguments:
1616     group points to the start of the group
1617     adjust the amount by which the group is to be moved
1618     utf8 TRUE in UTF-8 mode
1619     cd contains pointers to tables etc.
1620 nigel 93 save_hwm the hwm forward reference pointer at the start of the group
1621 nigel 77
1622     Returns: nothing
1623     */
1624    
1625     static void
1626 nigel 93 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1627     uschar *save_hwm)
1628 nigel 77 {
1629     uschar *ptr = group;
1630     while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1631     {
1632 nigel 93 int offset;
1633     uschar *hc;
1634    
1635     /* See if this recursion is on the forward reference list. If so, adjust the
1636     reference. */
1637    
1638     for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1639     {
1640     offset = GET(hc, 0);
1641     if (cd->start_code + offset == ptr + 1)
1642     {
1643     PUT(hc, 0, offset + adjust);
1644     break;
1645     }
1646     }
1647    
1648     /* Otherwise, adjust the recursion offset if it's after the start of this
1649     group. */
1650    
1651     if (hc >= cd->hwm)
1652     {
1653     offset = GET(ptr, 1);
1654     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1655     }
1656    
1657 nigel 77 ptr += 1 + LINK_SIZE;
1658     }
1659     }
1660    
1661    
1662    
1663     /*************************************************
1664     * Insert an automatic callout point *
1665     *************************************************/
1666    
1667     /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1668     callout points before each pattern item.
1669    
1670     Arguments:
1671     code current code pointer
1672     ptr current pattern pointer
1673     cd pointers to tables etc
1674    
1675     Returns: new code pointer
1676     */
1677    
1678     static uschar *
1679     auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1680     {
1681     *code++ = OP_CALLOUT;
1682     *code++ = 255;
1683     PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1684     PUT(code, LINK_SIZE, 0); /* Default length */
1685     return code + 2*LINK_SIZE;
1686     }
1687    
1688    
1689    
1690     /*************************************************
1691     * Complete a callout item *
1692     *************************************************/
1693    
1694     /* A callout item contains the length of the next item in the pattern, which
1695     we can't fill in till after we have reached the relevant point. This is used
1696     for both automatic and manual callouts.
1697    
1698     Arguments:
1699     previous_callout points to previous callout item
1700     ptr current pattern pointer
1701     cd pointers to tables etc
1702    
1703     Returns: nothing
1704     */
1705    
1706     static void
1707     complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1708     {
1709     int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1710     PUT(previous_callout, 2 + LINK_SIZE, length);
1711     }
1712    
1713    
1714    
#ifdef SUPPORT_UCP
/*************************************************
*           Get othercase range                  *
*************************************************/

/* Given the start and end of a class range, search upwards through the
characters for the next run whose "other case" values are consecutive. Each
call returns one such run and updates the start value, so that repeated calls
step through the whole of the original range.

Arguments:
  cptr        points to starting character value; updated
  d           end value
  ocptr       where to put start of othercase range
  odptr       where to put end of othercase range

Yield:        TRUE when range returned; FALSE when no more
*/

static BOOL
get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
  unsigned int *odptr)
{
unsigned int c, othercase, next;

/* Find the first character in the range that has an other case */

c = *cptr;
while (c <= d && (othercase = _pcre_ucp_othercase(c)) == NOTACHAR) c++;

if (c > d) return FALSE;

*ocptr = othercase;
next = othercase + 1;

/* Extend the run for as long as the other case values stay consecutive */

c++;
while (c <= d && _pcre_ucp_othercase(c) == next)
  {
  c++;
  next++;
  }

*odptr = next - 1;
*cptr = c;

return TRUE;
}
#endif  /* SUPPORT_UCP */
1760    
1761    
1762 nigel 93
1763 nigel 77 /*************************************************
1764 nigel 93 * Check if auto-possessifying is possible *
1765     *************************************************/
1766    
1767     /* This function is called for unlimited repeats of certain items, to see
1768     whether the next thing could possibly match the repeated item. If not, it makes
1769     sense to automatically possessify the repeated item.
1770    
1771     Arguments:
1772     op_code the repeated op code
1773     this data for this item, depends on the opcode
1774     utf8 TRUE in UTF-8 mode
1775     utf8_char used for utf8 character bytes, NULL if not relevant
1776     ptr next character in pattern
1777     options options bits
1778     cd contains pointers to tables etc.
1779    
1780     Returns: TRUE if possessifying is wanted
1781     */
1782    
1783     static BOOL
1784     check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
1785     const uschar *ptr, int options, compile_data *cd)
1786     {
1787     int next;
1788    
1789     /* Skip whitespace and comments in extended mode */
1790    
1791     if ((options & PCRE_EXTENDED) != 0)
1792     {
1793     for (;;)
1794     {
1795     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1796     if (*ptr == '#')
1797     {
1798     while (*(++ptr) != 0)
1799     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1800     }
1801     else break;
1802     }
1803     }
1804    
1805     /* If the next item is one that we can handle, get its value. A non-negative
1806     value is a character, a negative value is an escape value. */
1807    
1808     if (*ptr == '\\')
1809     {
1810     int temperrorcode = 0;
1811     next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
1812     if (temperrorcode != 0) return FALSE;
1813     ptr++; /* Point after the escape sequence */
1814     }
1815    
1816     else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
1817     {
1818     #ifdef SUPPORT_UTF8
1819     if (utf8) { GETCHARINC(next, ptr); } else
1820     #endif
1821     next = *ptr++;
1822     }
1823    
1824     else return FALSE;
1825    
1826     /* Skip whitespace and comments in extended mode */
1827    
1828     if ((options & PCRE_EXTENDED) != 0)
1829     {
1830     for (;;)
1831     {
1832     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1833     if (*ptr == '#')
1834     {
1835     while (*(++ptr) != 0)
1836     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1837     }
1838     else break;
1839     }
1840     }
1841    
1842     /* If the next thing is itself optional, we have to give up. */
1843    
1844     if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
1845     return FALSE;
1846    
1847     /* Now compare the next item with the previous opcode. If the previous is a
1848     positive single character match, "item" either contains the character or, if
1849     "item" is greater than 127 in utf8 mode, the character's bytes are in
1850     utf8_char. */
1851    
1852    
1853     /* Handle cases when the next item is a character. */
1854    
1855     if (next >= 0) switch(op_code)
1856     {
1857     case OP_CHAR:
1858     #ifdef SUPPORT_UTF8
1859     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1860     #endif
1861     return item != next;
1862    
1863     /* For CHARNC (caseless character) we must check the other case. If we have
1864     Unicode property support, we can use it to test the other case of
1865     high-valued characters. */
1866    
1867     case OP_CHARNC:
1868     #ifdef SUPPORT_UTF8
1869     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1870     #endif
1871     if (item == next) return FALSE;
1872     #ifdef SUPPORT_UTF8
1873     if (utf8)
1874     {
1875     unsigned int othercase;
1876     if (next < 128) othercase = cd->fcc[next]; else
1877     #ifdef SUPPORT_UCP
1878     othercase = _pcre_ucp_othercase((unsigned int)next);
1879     #else
1880     othercase = NOTACHAR;
1881     #endif
1882     return (unsigned int)item != othercase;
1883     }
1884     else
1885     #endif /* SUPPORT_UTF8 */
1886     return (item != cd->fcc[next]); /* Non-UTF-8 mode */
1887    
1888     /* For OP_NOT, "item" must be a single-byte character. */
1889    
1890     case OP_NOT:
1891     if (next < 0) return FALSE; /* Not a character */
1892     if (item == next) return TRUE;
1893     if ((options & PCRE_CASELESS) == 0) return FALSE;
1894     #ifdef SUPPORT_UTF8
1895     if (utf8)
1896     {
1897     unsigned int othercase;
1898     if (next < 128) othercase = cd->fcc[next]; else
1899     #ifdef SUPPORT_UCP
1900     othercase = _pcre_ucp_othercase(next);
1901     #else
1902     othercase = NOTACHAR;
1903     #endif
1904     return (unsigned int)item == othercase;
1905     }
1906     else
1907     #endif /* SUPPORT_UTF8 */
1908     return (item == cd->fcc[next]); /* Non-UTF-8 mode */
1909    
1910     case OP_DIGIT:
1911     return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
1912    
1913     case OP_NOT_DIGIT:
1914     return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
1915    
1916     case OP_WHITESPACE:
1917     return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
1918    
1919     case OP_NOT_WHITESPACE:
1920     return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
1921    
1922     case OP_WORDCHAR:
1923     return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
1924    
1925     case OP_NOT_WORDCHAR:
1926     return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
1927    
1928     default:
1929     return FALSE;
1930     }
1931    
1932    
1933     /* Handle the case when the next item is \d, \s, etc. */
1934    
1935     switch(op_code)
1936     {
1937     case OP_CHAR:
1938     case OP_CHARNC:
1939     #ifdef SUPPORT_UTF8
1940     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1941     #endif
1942     switch(-next)
1943     {
1944     case ESC_d:
1945     return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
1946    
1947     case ESC_D:
1948     return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
1949    
1950     case ESC_s:
1951     return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
1952    
1953     case ESC_S:
1954     return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
1955    
1956     case ESC_w:
1957     return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
1958    
1959     case ESC_W:
1960     return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
1961    
1962     default:
1963     return FALSE;
1964     }
1965    
1966     case OP_DIGIT:
1967     return next == -ESC_D || next == -ESC_s || next == -ESC_W;
1968    
1969     case OP_NOT_DIGIT:
1970     return next == -ESC_d;
1971    
1972     case OP_WHITESPACE:
1973     return next == -ESC_S || next == -ESC_d || next == -ESC_w;
1974    
1975     case OP_NOT_WHITESPACE:
1976     return next == -ESC_s;
1977    
1978     case OP_WORDCHAR:
1979     return next == -ESC_W || next == -ESC_s;
1980    
1981     case OP_NOT_WORDCHAR:
1982     return next == -ESC_w || next == -ESC_d;
1983    
1984     default:
1985     return FALSE;
1986     }
1987    
1988     /* Control does not reach here */
1989     }
1990    
1991    
1992    
1993     /*************************************************
1994 nigel 77 * Compile one branch *
1995     *************************************************/
1996    
1997 nigel 93 /* Scan the pattern, compiling it into a vector. If the options are
1998 nigel 77 changed during the branch, the pointer is used to change the external options
1999 nigel 93 bits. This function is used during the pre-compile phase when we are trying
2000     to find out the amount of memory needed, as well as during the real compile
2001     phase. The value of lengthptr distinguishes the two phases.
2002 nigel 77
2003     Arguments:
2004     optionsptr pointer to the option bits
2005     codeptr points to the pointer to the current code point
2006     ptrptr points to the current pattern pointer
2007     errorcodeptr points to error code variable
2008     firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2009     reqbyteptr set to the last literal character required, else < 0
2010     bcptr points to current branch chain
2011     cd contains pointers to tables etc.
2012 nigel 93 lengthptr NULL during the real compile phase
2013     points to length accumulator during pre-compile phase
2014 nigel 77
2015     Returns: TRUE on success
2016     FALSE, with *errorcodeptr set non-zero on error
2017     */
2018    
2019     static BOOL
2020 nigel 93 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2021     int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2022     compile_data *cd, int *lengthptr)
2023 nigel 77 {
2024     int repeat_type, op_type;
2025     int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2026     int bravalue = 0;
2027     int greedy_default, greedy_non_default;
2028     int firstbyte, reqbyte;
2029     int zeroreqbyte, zerofirstbyte;
2030     int req_caseopt, reqvary, tempreqvary;
2031     int options = *optionsptr;
2032     int after_manual_callout = 0;
2033 nigel 93 int length_prevgroup = 0;
2034 nigel 77 register int c;
2035     register uschar *code = *codeptr;
2036 nigel 93 uschar *last_code = code;
2037     uschar *orig_code = code;
2038 nigel 77 uschar *tempcode;
2039     BOOL inescq = FALSE;
2040     BOOL groupsetfirstbyte = FALSE;
2041     const uschar *ptr = *ptrptr;
2042     const uschar *tempptr;
2043     uschar *previous = NULL;
2044     uschar *previous_callout = NULL;
2045 nigel 93 uschar *save_hwm = NULL;
2046 nigel 77 uschar classbits[32];
2047    
2048     #ifdef SUPPORT_UTF8
2049     BOOL class_utf8;
2050     BOOL utf8 = (options & PCRE_UTF8) != 0;
2051     uschar *class_utf8data;
2052     uschar utf8_char[6];
2053     #else
2054     BOOL utf8 = FALSE;
2055 nigel 93 uschar *utf8_char = NULL;
2056 nigel 77 #endif
2057    
2058 nigel 93 #ifdef DEBUG
2059     if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2060     #endif
2061    
2062 nigel 77 /* Set up the default and non-default settings for greediness */
2063    
2064     greedy_default = ((options & PCRE_UNGREEDY) != 0);
2065     greedy_non_default = greedy_default ^ 1;
2066    
2067     /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2068     matching encountered yet". It gets changed to REQ_NONE if we hit something that
2069     matches a non-fixed char first char; reqbyte just remains unset if we never
2070     find one.
2071    
2072     When we hit a repeat whose minimum is zero, we may have to adjust these values
2073     to take the zero repeat into account. This is implemented by setting them to
2074     zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2075     item types that can be repeated set these backoff variables appropriately. */
2076    
2077     firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2078    
2079     /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2080     according to the current setting of the caseless flag. REQ_CASELESS is a bit
2081     value > 255. It is added into the firstbyte or reqbyte variables to record the
2082     case status of the value. This is used only for ASCII characters. */
2083    
2084     req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2085    
2086     /* Switch on next character until the end of the branch */
2087    
2088     for (;; ptr++)
2089     {
2090     BOOL negate_class;
2091     BOOL possessive_quantifier;
2092     BOOL is_quantifier;
2093 nigel 93 BOOL is_recurse;
2094 nigel 77 int class_charcount;
2095     int class_lastchar;
2096     int newoptions;
2097     int recno;
2098     int skipbytes;
2099     int subreqbyte;
2100     int subfirstbyte;
2101 nigel 93 int terminator;
2102 nigel 77 int mclength;
2103     uschar mcbuffer[8];
2104    
2105 nigel 93 /* Get next byte in the pattern */
2106 nigel 77
2107     c = *ptr;
2108    
2109 nigel 93 /* If we are in the pre-compile phase, accumulate the length used for the
2110     previous cycle of this loop. */
2111    
2112     if (lengthptr != NULL)
2113     {
2114     #ifdef DEBUG
2115     if (code > cd->hwm) cd->hwm = code; /* High water info */
2116     #endif
2117     if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2118     {
2119     *errorcodeptr = ERR52;
2120     goto FAILED;
2121     }
2122    
2123     /* There is at least one situation where code goes backwards: this is the
2124     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2125     the class is simply eliminated. However, it is created first, so we have to
2126     allow memory for it. Therefore, don't ever reduce the length at this point.
2127     */
2128    
2129     if (code < last_code) code = last_code;
2130     *lengthptr += code - last_code;
2131     DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2132    
2133     /* If "previous" is set and it is not at the start of the work space, move
2134     it back to there, in order to avoid filling up the work space. Otherwise,
2135     if "previous" is NULL, reset the current code pointer to the start. */
2136    
2137     if (previous != NULL)
2138     {
2139     if (previous > orig_code)
2140     {
2141     memmove(orig_code, previous, code - previous);
2142     code -= previous - orig_code;
2143     previous = orig_code;
2144     }
2145     }
2146     else code = orig_code;
2147    
2148     /* Remember where this code item starts so we can pick up the length
2149     next time round. */
2150    
2151     last_code = code;
2152     }
2153    
2154     /* In the real compile phase, just check the workspace used by the forward
2155     reference list. */
2156    
2157     else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2158     {
2159     *errorcodeptr = ERR52;
2160     goto FAILED;
2161     }
2162    
2163 nigel 77 /* If in \Q...\E, check for the end; if not, we have a literal */
2164    
2165     if (inescq && c != 0)
2166     {
2167     if (c == '\\' && ptr[1] == 'E')
2168     {
2169     inescq = FALSE;
2170     ptr++;
2171     continue;
2172     }
2173     else
2174     {
2175     if (previous_callout != NULL)
2176     {
2177 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2178     complete_callout(previous_callout, ptr, cd);
2179 nigel 77 previous_callout = NULL;
2180     }
2181     if ((options & PCRE_AUTO_CALLOUT) != 0)
2182     {
2183     previous_callout = code;
2184     code = auto_callout(code, ptr, cd);
2185     }
2186     goto NORMAL_CHAR;
2187     }
2188     }
2189    
2190     /* Fill in length of a previous callout, except when the next thing is
2191     a quantifier. */
2192    
2193     is_quantifier = c == '*' || c == '+' || c == '?' ||
2194     (c == '{' && is_counted_repeat(ptr+1));
2195    
2196     if (!is_quantifier && previous_callout != NULL &&
2197     after_manual_callout-- <= 0)
2198     {
2199 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2200     complete_callout(previous_callout, ptr, cd);
2201 nigel 77 previous_callout = NULL;
2202     }
2203    
2204     /* In extended mode, skip white space and comments */
2205    
2206     if ((options & PCRE_EXTENDED) != 0)
2207     {
2208     if ((cd->ctypes[c] & ctype_space) != 0) continue;
2209     if (c == '#')
2210     {
2211 nigel 93 while (*(++ptr) != 0)
2212 nigel 91 {
2213 nigel 93 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2214 nigel 91 }
2215 nigel 93 if (*ptr != 0) continue;
2216    
2217 nigel 91 /* Else fall through to handle end of string */
2218     c = 0;
2219 nigel 77 }
2220     }
2221    
2222     /* No auto callout for quantifiers. */
2223    
2224     if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2225     {
2226     previous_callout = code;
2227     code = auto_callout(code, ptr, cd);
2228     }
2229    
2230     switch(c)
2231     {
2232 nigel 93 /* ===================================================================*/
2233     case 0: /* The branch terminates at string end */
2234     case '|': /* or | or ) */
2235 nigel 77 case ')':
2236     *firstbyteptr = firstbyte;
2237     *reqbyteptr = reqbyte;
2238     *codeptr = code;
2239     *ptrptr = ptr;
2240 nigel 93 if (lengthptr != NULL)
2241     {
2242     *lengthptr += code - last_code; /* To include callout length */
2243     DPRINTF((">> end branch\n"));
2244     }
2245 nigel 77 return TRUE;
2246    
2247 nigel 93
2248     /* ===================================================================*/
2249 nigel 77 /* Handle single-character metacharacters. In multiline mode, ^ disables
2250     the setting of any following char as a first character. */
2251    
2252     case '^':
2253     if ((options & PCRE_MULTILINE) != 0)
2254     {
2255     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2256     }
2257     previous = NULL;
2258     *code++ = OP_CIRC;
2259     break;
2260    
2261     case '$':
2262     previous = NULL;
2263     *code++ = OP_DOLL;
2264     break;
2265    
2266     /* There can never be a first char if '.' is first, whatever happens about
2267     repeats. The value of reqbyte doesn't change either. */
2268    
2269     case '.':
2270     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2271     zerofirstbyte = firstbyte;
2272     zeroreqbyte = reqbyte;
2273     previous = code;
2274     *code++ = OP_ANY;
2275     break;
2276    
2277 nigel 93
2278     /* ===================================================================*/
2279 nigel 87 /* Character classes. If the included characters are all < 256, we build a
2280     32-byte bitmap of the permitted characters, except in the special case
2281     where there is only one such character. For negated classes, we build the
2282     map as usual, then invert it at the end. However, we use a different opcode
2283     so that data characters > 255 can be handled correctly.
2284 nigel 77
2285     If the class contains characters outside the 0-255 range, a different
2286     opcode is compiled. It may optionally have a bit map for characters < 256,
2287     but those above are are explicitly listed afterwards. A flag byte tells
2288     whether the bitmap is present, and whether this is a negated class or not.
2289     */
2290    
2291     case '[':
2292     previous = code;
2293    
2294     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2295     they are encountered at the top level, so we'll do that too. */
2296    
2297     if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2298     check_posix_syntax(ptr, &tempptr, cd))
2299     {
2300     *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2301     goto FAILED;
2302     }
2303    
2304     /* If the first character is '^', set the negation flag and skip it. */
2305    
2306     if ((c = *(++ptr)) == '^')
2307     {
2308     negate_class = TRUE;
2309     c = *(++ptr);
2310     }
2311     else
2312     {
2313     negate_class = FALSE;
2314     }
2315    
2316     /* Keep a count of chars with values < 256 so that we can optimize the case
2317 nigel 93 of just a single character (as long as it's < 256). However, For higher
2318     valued UTF-8 characters, we don't yet do any optimization. */
2319 nigel 77
2320     class_charcount = 0;
2321     class_lastchar = -1;
2322    
2323 nigel 93 /* Initialize the 32-char bit map to all zeros. We build the map in a
2324     temporary bit of memory, in case the class contains only 1 character (less
2325     than 256), because in that case the compiled code doesn't use the bit map.
2326     */
2327    
2328     memset(classbits, 0, 32 * sizeof(uschar));
2329    
2330 nigel 77 #ifdef SUPPORT_UTF8
2331     class_utf8 = FALSE; /* No chars >= 256 */
2332 nigel 93 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
2333 nigel 77 #endif
2334    
2335     /* Process characters until ] is reached. By writing this as a "do" it
2336 nigel 93 means that an initial ] is taken as a data character. At the start of the
2337     loop, c contains the first byte of the character. */
2338 nigel 77
2339 nigel 93 if (c != 0) do
2340 nigel 77 {
2341 nigel 93 const uschar *oldptr;
2342    
2343 nigel 77 #ifdef SUPPORT_UTF8
2344     if (utf8 && c > 127)
2345     { /* Braces are required because the */
2346     GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2347     }
2348     #endif
2349    
2350     /* Inside \Q...\E everything is literal except \E */
2351    
2352     if (inescq)
2353     {
2354 nigel 93 if (c == '\\' && ptr[1] == 'E') /* If we are at \E */
2355 nigel 77 {
2356 nigel 93 inescq = FALSE; /* Reset literal state */
2357     ptr++; /* Skip the 'E' */
2358     continue; /* Carry on with next */
2359 nigel 77 }
2360 nigel 93 goto CHECK_RANGE; /* Could be range if \E follows */
2361 nigel 77 }
2362    
2363     /* Handle POSIX class names. Perl allows a negation extension of the
2364     form [:^name:]. A square bracket that doesn't match the syntax is
2365     treated as a literal. We also recognize the POSIX constructions
2366     [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2367     5.6 and 5.8 do. */
2368    
2369     if (c == '[' &&
2370     (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2371     check_posix_syntax(ptr, &tempptr, cd))
2372     {
2373     BOOL local_negate = FALSE;
2374 nigel 87 int posix_class, taboffset, tabopt;
2375 nigel 77 register const uschar *cbits = cd->cbits;
2376 nigel 87 uschar pbits[32];
2377 nigel 77
2378     if (ptr[1] != ':')
2379     {
2380     *errorcodeptr = ERR31;
2381     goto FAILED;
2382     }
2383    
2384     ptr += 2;
2385     if (*ptr == '^')
2386     {
2387     local_negate = TRUE;
2388     ptr++;
2389     }
2390    
2391     posix_class = check_posix_name(ptr, tempptr - ptr);
2392     if (posix_class < 0)
2393     {
2394     *errorcodeptr = ERR30;
2395     goto FAILED;
2396     }
2397    
2398     /* If matching is caseless, upper and lower are converted to
2399     alpha. This relies on the fact that the class table starts with
2400     alpha, lower, upper as the first 3 entries. */
2401    
2402     if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2403     posix_class = 0;
2404    
2405 nigel 87 /* We build the bit map for the POSIX class in a chunk of local store
2406     because we may be adding and subtracting from it, and we don't want to
2407     subtract bits that may be in the main map already. At the end we or the
2408     result into the bit map that is being built. */
2409 nigel 77
2410     posix_class *= 3;
2411 nigel 87
2412     /* Copy in the first table (always present) */
2413    
2414     memcpy(pbits, cbits + posix_class_maps[posix_class],
2415     32 * sizeof(uschar));
2416    
2417     /* If there is a second table, add or remove it as required. */
2418    
2419     taboffset = posix_class_maps[posix_class + 1];
2420     tabopt = posix_class_maps[posix_class + 2];
2421    
2422     if (taboffset >= 0)
2423 nigel 77 {
2424 nigel 87 if (tabopt >= 0)
2425     for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
2426 nigel 77 else
2427 nigel 87 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
2428 nigel 77 }
2429    
2430 nigel 87 /* Not see if we need to remove any special characters. An option
2431     value of 1 removes vertical space and 2 removes underscore. */
2432    
2433     if (tabopt < 0) tabopt = -tabopt;
2434     if (tabopt == 1) pbits[1] &= ~0x3c;
2435     else if (tabopt == 2) pbits[11] &= 0x7f;
2436    
2437     /* Add the POSIX table or its complement into the main table that is
2438     being built and we are done. */
2439    
2440     if (local_negate)
2441     for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2442     else
2443     for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2444    
2445 nigel 77 ptr = tempptr + 1;
2446     class_charcount = 10; /* Set > 1; assumes more than 1 per class */
2447     continue; /* End of POSIX syntax handling */
2448     }
2449    
2450     /* Backslash may introduce a single character, or it may introduce one
2451 nigel 93 of the specials, which just set a flag. The sequence \b is a special
2452     case. Inside a class (and only there) it is treated as backspace.
2453     Elsewhere it marks a word boundary. Other escapes have preset maps ready
2454     to or into the one we are building. We assume they have more than one
2455 nigel 77 character in them, so set class_charcount bigger than one. */
2456    
2457     if (c == '\\')
2458     {
2459 nigel 93 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2460     if (*errorcodeptr != 0) goto FAILED;
2461 nigel 77
2462     if (-c == ESC_b) c = '\b'; /* \b is backslash in a class */
2463     else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
2464 nigel 93 else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */
2465 nigel 77 else if (-c == ESC_Q) /* Handle start of quoted string */
2466     {
2467     if (ptr[1] == '\\' && ptr[2] == 'E')
2468     {
2469     ptr += 2; /* avoid empty string */
2470     }
2471     else inescq = TRUE;
2472     continue;
2473     }
2474    
2475     if (c < 0)
2476     {
2477     register const uschar *cbits = cd->cbits;
2478     class_charcount += 2; /* Greater than 1 is what matters */
2479 nigel 93
2480     /* Save time by not doing this in the pre-compile phase. */
2481    
2482     if (lengthptr == NULL) switch (-c)
2483 nigel 77 {
2484     case ESC_d:
2485     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2486     continue;
2487    
2488     case ESC_D:
2489     for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2490     continue;
2491    
2492     case ESC_w:
2493     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2494     continue;
2495    
2496     case ESC_W:
2497     for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2498     continue;
2499    
2500     case ESC_s:
2501     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2502     classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2503     continue;
2504    
2505     case ESC_S:
2506     for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2507     classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2508     continue;
2509    
2510 nigel 93 case ESC_E: /* Perl ignores an orphan \E */
2511     continue;
2512    
2513     default: /* Not recognized; fall through */
2514     break; /* Need "default" setting to stop compiler warning. */
2515     }
2516    
2517     /* In the pre-compile phase, just do the recognition. */
2518    
2519     else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2520     c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2521    
2522     /* We need to deal with \P and \p in both phases. */
2523    
2524 nigel 77 #ifdef SUPPORT_UCP
2525 nigel 93 if (-c == ESC_p || -c == ESC_P)
2526     {
2527     BOOL negated;
2528     int pdata;
2529     int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2530     if (ptype < 0) goto FAILED;
2531     class_utf8 = TRUE;
2532     *class_utf8data++ = ((-c == ESC_p) != negated)?
2533     XCL_PROP : XCL_NOTPROP;
2534     *class_utf8data++ = ptype;
2535     *class_utf8data++ = pdata;
2536     class_charcount -= 2; /* Not a < 256 character */
2537 nigel 77 continue;
2538 nigel 93 }
2539 nigel 77 #endif
2540 nigel 93 /* Unrecognized escapes are faulted if PCRE is running in its
2541     strict mode. By default, for compatibility with Perl, they are
2542     treated as literals. */
2543 nigel 77
2544 nigel 93 if ((options & PCRE_EXTRA) != 0)
2545     {
2546     *errorcodeptr = ERR7;
2547     goto FAILED;
2548     }
2549 nigel 77
2550 nigel 93 class_charcount -= 2; /* Undo the default count from above */
2551     c = *ptr; /* Get the final character and fall through */
2552 nigel 77 }
2553    
2554     /* Fall through if we have a single character (c >= 0). This may be
2555 nigel 93 greater than 256 in UTF-8 mode. */
2556 nigel 77
2557     } /* End of backslash handling */
2558    
2559     /* A single character may be followed by '-' to form a range. However,
2560     Perl does not permit ']' to be the end of the range. A '-' character
2561 nigel 93 at the end is treated as a literal. Perl ignores orphaned \E sequences
2562     entirely. The code for handling \Q and \E is messy. */
2563 nigel 77
2564 nigel 93 CHECK_RANGE:
2565     while (ptr[1] == '\\' && ptr[2] == 'E')
2566 nigel 77 {
2567 nigel 93 inescq = FALSE;
2568     ptr += 2;
2569     }
2570    
2571     oldptr = ptr;
2572    
2573     if (!inescq && ptr[1] == '-')
2574     {
2575 nigel 77 int d;
2576     ptr += 2;
2577 nigel 93 while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
2578 nigel 77
2579 nigel 93 /* If we hit \Q (not followed by \E) at this point, go into escaped
2580     mode. */
2581    
2582     while (*ptr == '\\' && ptr[1] == 'Q')
2583     {
2584     ptr += 2;
2585     if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
2586     inescq = TRUE;
2587     break;
2588     }
2589    
2590     if (*ptr == 0 || (!inescq && *ptr == ']'))
2591     {
2592     ptr = oldptr;
2593     goto LONE_SINGLE_CHARACTER;
2594     }
2595    
2596 nigel 77 #ifdef SUPPORT_UTF8
2597     if (utf8)
2598     { /* Braces are required because the */
2599     GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
2600     }
2601     else
2602     #endif
2603     d = *ptr; /* Not UTF-8 mode */
2604    
2605     /* The second part of a range can be a single-character escape, but
2606     not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2607     in such circumstances. */
2608    
2609 nigel 93 if (!inescq && d == '\\')
2610 nigel 77 {
2611 nigel 93 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2612     if (*errorcodeptr != 0) goto FAILED;
2613 nigel 77
2614 nigel 93 /* \b is backslash; \X is literal X; \R is literal R; any other
2615     special means the '-' was literal */
2616 nigel 77
2617     if (d < 0)
2618     {
2619     if (d == -ESC_b) d = '\b';
2620 nigel 93 else if (d == -ESC_X) d = 'X';
2621     else if (d == -ESC_R) d = 'R'; else
2622 nigel 77 {
2623 nigel 93 ptr = oldptr;
2624 nigel 77 goto LONE_SINGLE_CHARACTER; /* A few lines below */
2625     }
2626     }
2627     }
2628    
2629 nigel 93 /* Check that the two values are in the correct order. Optimize
2630     one-character ranges */
2631 nigel 77
2632 nigel 93 if (d < c)
2633     {
2634     *errorcodeptr = ERR8;
2635     goto FAILED;
2636     }
2637    
2638 nigel 77 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
2639    
2640     /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
2641     matching, we have to use an XCLASS with extra data items. Caseless
2642     matching for characters > 127 is available only if UCP support is
2643     available. */
2644    
2645     #ifdef SUPPORT_UTF8
2646     if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
2647     {
2648     class_utf8 = TRUE;
2649    
2650     /* With UCP support, we can find the other case equivalents of
2651     the relevant characters. There may be several ranges. Optimize how
2652     they fit with the basic range. */
2653    
2654     #ifdef SUPPORT_UCP
2655     if ((options & PCRE_CASELESS) != 0)
2656     {
2657 nigel 93 unsigned int occ, ocd;
2658     unsigned int cc = c;
2659     unsigned int origd = d;
2660 nigel 77 while (get_othercase_range(&cc, origd, &occ, &ocd))
2661     {
2662     if (occ >= c && ocd <= d) continue; /* Skip embedded ranges */
2663    
2664     if (occ < c && ocd >= c - 1) /* Extend the basic range */
2665     { /* if there is overlap, */
2666     c = occ; /* noting that if occ < c */
2667     continue; /* we can't have ocd > d */
2668     } /* because a subrange is */
2669     if (ocd > d && occ <= d + 1) /* always shorter than */
2670     { /* the basic range. */
2671     d = ocd;
2672     continue;
2673     }
2674    
2675     if (occ == ocd)
2676     {
2677     *class_utf8data++ = XCL_SINGLE;
2678     }
2679     else
2680     {
2681     *class_utf8data++ = XCL_RANGE;
2682     class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
2683     }
2684     class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
2685     }
2686     }
2687     #endif /* SUPPORT_UCP */
2688    
2689     /* Now record the original range, possibly modified for UCP caseless
2690     overlapping ranges. */
2691    
2692     *class_utf8data++ = XCL_RANGE;
2693     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2694     class_utf8data += _pcre_ord2utf8(d, class_utf8data);
2695    
2696     /* With UCP support, we are done. Without UCP support, there is no
2697     caseless matching for UTF-8 characters > 127; we can use the bit map
2698     for the smaller ones. */
2699    
2700     #ifdef SUPPORT_UCP
2701     continue; /* With next character in the class */
2702     #else
2703     if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
2704    
2705     /* Adjust upper limit and fall through to set up the map */
2706    
2707     d = 127;
2708    
2709     #endif /* SUPPORT_UCP */
2710     }
2711     #endif /* SUPPORT_UTF8 */
2712    
2713     /* We use the bit map for all cases when not in UTF-8 mode; else
2714     ranges that lie entirely within 0-127 when there is UCP support; else
2715     for partial ranges without UCP support. */
2716    
2717 nigel 93 class_charcount += d - c + 1;
2718     class_lastchar = d;
2719    
2720     /* We can save a bit of time by skipping this in the pre-compile. */
2721    
2722     if (lengthptr == NULL) for (; c <= d; c++)
2723 nigel 77 {
2724     classbits[c/8] |= (1 << (c&7));
2725     if ((options & PCRE_CASELESS) != 0)
2726     {
2727     int uc = cd->fcc[c]; /* flip case */
2728     classbits[uc/8] |= (1 << (uc&7));
2729     }
2730     }
2731    
2732     continue; /* Go get the next char in the class */
2733     }
2734    
2735     /* Handle a lone single character - we can get here for a normal
2736     non-escape char, or after \ that introduces a single character or for an
2737     apparent range that isn't. */
2738    
2739     LONE_SINGLE_CHARACTER:
2740    
2741     /* Handle a character that cannot go in the bit map */
2742    
2743     #ifdef SUPPORT_UTF8
2744     if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
2745     {
2746     class_utf8 = TRUE;
2747     *class_utf8data++ = XCL_SINGLE;
2748     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2749    
2750     #ifdef SUPPORT_UCP
2751     if ((options & PCRE_CASELESS) != 0)
2752     {
2753 nigel 93 unsigned int othercase;
2754     if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
2755 nigel 77 {
2756     *class_utf8data++ = XCL_SINGLE;
2757     class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
2758     }
2759     }
2760     #endif /* SUPPORT_UCP */
2761    
2762     }
2763     else
2764     #endif /* SUPPORT_UTF8 */
2765    
2766     /* Handle a single-byte character */
2767     {
2768     classbits[c/8] |= (1 << (c&7));
2769     if ((options & PCRE_CASELESS) != 0)
2770     {
2771     c = cd->fcc[c]; /* flip case */
2772     classbits[c/8] |= (1 << (c&7));
2773     }
2774     class_charcount++;
2775     class_lastchar = c;
2776     }
2777     }
2778    
2779 nigel 93 /* Loop until ']' reached. This "while" is the end of the "do" above. */
2780 nigel 77
2781 nigel 93 while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
2782 nigel 77
2783 nigel 93 if (c == 0) /* Missing terminating ']' */
2784     {
2785     *errorcodeptr = ERR6;
2786     goto FAILED;
2787     }
2788    
2789 nigel 77 /* If class_charcount is 1, we saw precisely one character whose value is
2790     less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
2791     can optimize the negative case only if there were no characters >= 128
2792     because OP_NOT and the related opcodes like OP_NOTSTAR operate on
2793     single-bytes only. This is an historical hangover. Maybe one day we can
2794     tidy these opcodes to handle multi-byte characters.
2795    
2796     The optimization throws away the bit map. We turn the item into a
2797     1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
2798     that OP_NOT does not support multibyte characters. In the positive case, it
2799     can cause firstbyte to be set. Otherwise, there can be no first char if
2800     this item is first, whatever repeat count may follow. In the case of
2801     reqbyte, save the previous value for reinstating. */
2802    
2803     #ifdef SUPPORT_UTF8
2804     if (class_charcount == 1 &&
2805     (!utf8 ||
2806     (!class_utf8 && (!negate_class || class_lastchar < 128))))
2807    
2808     #else
2809     if (class_charcount == 1)
2810     #endif
2811     {
2812     zeroreqbyte = reqbyte;
2813    
2814     /* The OP_NOT opcode works on one-byte characters only. */
2815    
2816     if (negate_class)
2817     {
2818     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2819     zerofirstbyte = firstbyte;
2820     *code++ = OP_NOT;
2821     *code++ = class_lastchar;
2822     break;
2823     }
2824    
2825     /* For a single, positive character, get the value into mcbuffer, and
2826     then we can handle this with the normal one-character code. */
2827    
2828     #ifdef SUPPORT_UTF8
2829     if (utf8 && class_lastchar > 127)
2830     mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
2831     else
2832     #endif
2833     {
2834     mcbuffer[0] = class_lastchar;
2835     mclength = 1;
2836     }
2837     goto ONE_CHAR;
2838     } /* End of 1-char optimization */
2839    
2840     /* The general case - not the one-char optimization. If this is the first
2841     thing in the branch, there can be no first char setting, whatever the
2842     repeat count. Any reqbyte setting must remain unchanged after any kind of
2843     repeat. */
2844    
2845     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2846     zerofirstbyte = firstbyte;
2847     zeroreqbyte = reqbyte;
2848    
2849     /* If there are characters with values > 255, we have to compile an
2850     extended class, with its own opcode. If there are no characters < 256,
2851 nigel 93 we can omit the bitmap in the actual compiled code. */
2852 nigel 77
2853     #ifdef SUPPORT_UTF8
2854     if (class_utf8)
2855     {
2856     *class_utf8data++ = XCL_END; /* Marks the end of extra data */
2857     *code++ = OP_XCLASS;
2858     code += LINK_SIZE;
2859     *code = negate_class? XCL_NOT : 0;
2860    
2861 nigel 93 /* If the map is required, move up the extra data to make room for it;
2862     otherwise just move the code pointer to the end of the extra data. */
2863 nigel 77
2864     if (class_charcount > 0)
2865     {
2866     *code++ |= XCL_MAP;
2867 nigel 93 memmove(code + 32, code, class_utf8data - code);
2868 nigel 77 memcpy(code, classbits, 32);
2869 nigel 93 code = class_utf8data + 32;
2870 nigel 77 }
2871 nigel 93 else code = class_utf8data;
2872 nigel 77
2873     /* Now fill in the complete length of the item */
2874    
2875     PUT(previous, 1, code - previous);
2876     break; /* End of class handling */
2877     }
2878     #endif
2879    
2880     /* If there are no characters > 255, negate the 32-byte map if necessary,
2881     and copy it into the code vector. If this is the first thing in the branch,
2882     there can be no first char setting, whatever the repeat count. Any reqbyte
2883     setting must remain unchanged after any kind of repeat. */
2884    
2885     if (negate_class)
2886     {
2887     *code++ = OP_NCLASS;
2888 nigel 93 if (lengthptr == NULL) /* Save time in the pre-compile phase */
2889     for (c = 0; c < 32; c++) code[c] = ~classbits[c];
2890 nigel 77 }
2891     else
2892     {
2893     *code++ = OP_CLASS;
2894     memcpy(code, classbits, 32);
2895     }
2896     code += 32;
2897     break;
2898    
2899 nigel 93
2900     /* ===================================================================*/
2901 nigel 77 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
2902     has been tested above. */
2903    
2904     case '{':
2905     if (!is_quantifier) goto NORMAL_CHAR;
2906     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
2907     if (*errorcodeptr != 0) goto FAILED;
2908     goto REPEAT;
2909    
2910     case '*':
2911     repeat_min = 0;
2912     repeat_max = -1;
2913     goto REPEAT;
2914    
2915     case '+':
2916     repeat_min = 1;
2917     repeat_max = -1;
2918     goto REPEAT;
2919    
2920     case '?':
2921     repeat_min = 0;
2922     repeat_max = 1;
2923    
2924     REPEAT:
2925     if (previous == NULL)
2926     {
2927     *errorcodeptr = ERR9;
2928     goto FAILED;
2929     }
2930    
2931     if (repeat_min == 0)
2932     {
2933     firstbyte = zerofirstbyte; /* Adjust for zero repeat */
2934     reqbyte = zeroreqbyte; /* Ditto */
2935     }
2936    
2937     /* Remember whether this is a variable length repeat */
2938    
2939     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
2940    
2941     op_type = 0; /* Default single-char op codes */
2942     possessive_quantifier = FALSE; /* Default not possessive quantifier */
2943    
2944     /* Save start of previous item, in case we have to move it up to make space
2945     for an inserted OP_ONCE for the additional '+' extension. */
2946    
2947     tempcode = previous;
2948    
2949     /* If the next character is '+', we have a possessive quantifier. This
2950     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
2951     If the next character is '?' this is a minimizing repeat, by default,
2952     but if PCRE_UNGREEDY is set, it works the other way round. We change the
2953     repeat type to the non-default. */
2954    
2955     if (ptr[1] == '+')
2956     {
2957     repeat_type = 0; /* Force greedy */
2958     possessive_quantifier = TRUE;
2959     ptr++;
2960     }
2961     else if (ptr[1] == '?')
2962     {
2963     repeat_type = greedy_non_default;
2964     ptr++;
2965     }
2966     else repeat_type = greedy_default;
2967    
2968     /* If previous was a character match, abolish the item and generate a
2969     repeat item instead. If a char item has a minimum of more than one, ensure
2970     that it is set in reqbyte - it might not be if a sequence such as x{3} is
2971     the first thing in a branch because the x will have gone into firstbyte
2972     instead. */
2973    
2974     if (*previous == OP_CHAR || *previous == OP_CHARNC)
2975     {
2976     /* Deal with UTF-8 characters that take up more than one byte. It's
2977     easier to write this out separately than try to macrify it. Use c to
2978     hold the length of the character in bytes, plus 0x80 to flag that it's a
2979     length rather than a small character. */
2980    
2981     #ifdef SUPPORT_UTF8
2982     if (utf8 && (code[-1] & 0x80) != 0)
2983     {
2984     uschar *lastchar = code - 1;
2985     while((*lastchar & 0xc0) == 0x80) lastchar--;
2986     c = code - lastchar; /* Length of UTF-8 character */
2987     memcpy(utf8_char, lastchar, c); /* Save the char */
2988     c |= 0x80; /* Flag c as a length */
2989     }
2990     else
2991     #endif
2992    
2993     /* Handle the case of a single byte - either with no UTF8 support, or
2994     with UTF-8 disabled, or for a UTF-8 character < 128. */
2995    
2996     {
2997     c = code[-1];
2998     if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
2999     }
3000    
3001 nigel 93 /* If the repetition is unlimited, it pays to see if the next thing on
3002     the line is something that cannot possibly match this character. If so,
3003     automatically possessifying this item gains some performance in the case
3004     where the match fails. */
3005    
3006     if (!possessive_quantifier &&
3007     repeat_max < 0 &&
3008     check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3009     options, cd))
3010     {
3011     repeat_type = 0; /* Force greedy */
3012     possessive_quantifier = TRUE;
3013     }
3014    
3015 nigel 77 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3016     }
3017    
3018     /* If previous was a single negated character ([^a] or similar), we use
3019     one of the special opcodes, replacing it. The code is shared with single-
3020     character repeats by setting op_type to add a suitable offset into
3021 nigel 93 repeat_type. We can also test for auto-possessification. OP_NOT is
3022     currently used only for single-byte chars. */
3023 nigel 77
3024     else if (*previous == OP_NOT)
3025     {
3026     op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3027     c = previous[1];
3028 nigel 93 if (!possessive_quantifier &&
3029     repeat_max < 0 &&
3030     check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3031     {
3032     repeat_type = 0; /* Force greedy */
3033     possessive_quantifier = TRUE;
3034     }
3035 nigel 77 goto OUTPUT_SINGLE_REPEAT;
3036     }
3037    
3038     /* If previous was a character type match (\d or similar), abolish it and
3039     create a suitable repeat item. The code is shared with single-character
3040     repeats by setting op_type to add a suitable offset into repeat_type. Note
3041     that the Unicode property types will be present only when SUPPORT_UCP is
3042     defined, but we don't wrap the little bits of code here because it just
3043     makes it horribly messy. */
3044    
3045     else if (*previous < OP_EODN)
3046     {
3047     uschar *oldcode;
3048 nigel 87 int prop_type, prop_value;
3049 nigel 77 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3050     c = *previous;
3051    
3052 nigel 93 if (!possessive_quantifier &&
3053     repeat_max < 0 &&
3054     check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3055     {
3056     repeat_type = 0; /* Force greedy */
3057     possessive_quantifier = TRUE;
3058     }
3059    
3060 nigel 77 OUTPUT_SINGLE_REPEAT:
3061 nigel 87 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3062     {
3063     prop_type = previous[1];
3064     prop_value = previous[2];
3065     }
3066     else prop_type = prop_value = -1;
3067 nigel 77
3068     oldcode = code;
3069     code = previous; /* Usually overwrite previous item */
3070    
3071     /* If the maximum is zero then the minimum must also be zero; Perl allows
3072     this case, so we do too - by simply omitting the item altogether. */
3073    
3074     if (repeat_max == 0) goto END_REPEAT;
3075    
3076     /* All real repeats make it impossible to handle partial matching (maybe
3077     one day we will be able to remove this restriction). */
3078    
3079     if (repeat_max != 1) cd->nopartial = TRUE;
3080    
3081     /* Combine the op_type with the repeat_type */
3082    
3083     repeat_type += op_type;
3084    
3085     /* A minimum of zero is handled either as the special case * or ?, or as
3086     an UPTO, with the maximum given. */
3087    
3088     if (repeat_min == 0)
3089     {
3090     if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3091     else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3092     else
3093     {
3094     *code++ = OP_UPTO + repeat_type;
3095     PUT2INC(code, 0, repeat_max);
3096     }
3097     }
3098    
3099     /* A repeat minimum of 1 is optimized into some special cases. If the
3100 nigel 93 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3101 nigel 77 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3102     one less than the maximum. */
3103    
3104     else if (repeat_min == 1)
3105     {
3106     if (repeat_max == -1)
3107     *code++ = OP_PLUS + repeat_type;
3108     else
3109     {
3110     code = oldcode; /* leave previous item in place */
3111     if (repeat_max == 1) goto END_REPEAT;
3112     *code++ = OP_UPTO + repeat_type;
3113     PUT2INC(code, 0, repeat_max - 1);
3114     }
3115     }
3116    
3117     /* The case {n,n} is just an EXACT, while the general case {n,m} is
3118     handled as an EXACT followed by an UPTO. */
3119    
3120     else
3121     {
3122     *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3123     PUT2INC(code, 0, repeat_min);
3124    
3125     /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3126     we have to insert the character for the previous code. For a repeated
3127 nigel 87 Unicode property match, there are two extra bytes that define the
3128 nigel 77 required property. In UTF-8 mode, long characters have their length in
3129     c, with the 0x80 bit as a flag. */
3130    
3131     if (repeat_max < 0)
3132     {
3133     #ifdef SUPPORT_UTF8
3134     if (utf8 && c >= 128)
3135     {
3136     memcpy(code, utf8_char, c & 7);
3137     code += c & 7;
3138     }
3139     else
3140     #endif
3141     {
3142     *code++ = c;
3143 nigel 87 if (prop_type >= 0)
3144     {
3145     *code++ = prop_type;
3146     *code++ = prop_value;
3147     }
3148 nigel 77 }
3149     *code++ = OP_STAR + repeat_type;
3150     }
3151    
3152     /* Else insert an UPTO if the max is greater than the min, again
3153 nigel 93 preceded by the character, for the previously inserted code. If the
3154     UPTO is just for 1 instance, we can use QUERY instead. */
3155 nigel 77
3156     else if (repeat_max != repeat_min)
3157     {
3158     #ifdef SUPPORT_UTF8
3159     if (utf8 && c >= 128)
3160     {
3161     memcpy(code, utf8_char, c & 7);
3162     code += c & 7;
3163     }
3164     else
3165     #endif
3166     *code++ = c;
3167 nigel 87 if (prop_type >= 0)
3168     {
3169     *code++ = prop_type;
3170     *code++ = prop_value;
3171     }
3172 nigel 77 repeat_max -= repeat_min;
3173 nigel 93
3174     if (repeat_max == 1)
3175     {
3176     *code++ = OP_QUERY + repeat_type;
3177     }
3178     else
3179     {
3180     *code++ = OP_UPTO + repeat_type;
3181     PUT2INC(code, 0, repeat_max);
3182     }
3183 nigel 77 }
3184     }
3185    
3186     /* The character or character type itself comes last in all cases. */
3187    
3188     #ifdef SUPPORT_UTF8
3189     if (utf8 && c >= 128)
3190     {
3191     memcpy(code, utf8_char, c & 7);
3192     code += c & 7;
3193     }
3194     else
3195     #endif
3196     *code++ = c;
3197    
3198 nigel 87 /* For a repeated Unicode property match, there are two extra bytes that
3199     define the required property. */
3200 nigel 77
3201     #ifdef SUPPORT_UCP
3202 nigel 87 if (prop_type >= 0)
3203     {
3204     *code++ = prop_type;
3205     *code++ = prop_value;
3206     }
3207 nigel 77 #endif
3208     }
3209    
3210     /* If previous was a character class or a back reference, we put the repeat
3211     stuff after it, but just skip the item if the repeat was {0,0}. */
3212    
3213     else if (*previous == OP_CLASS ||
3214     *previous == OP_NCLASS ||
3215     #ifdef SUPPORT_UTF8
3216     *previous == OP_XCLASS ||
3217     #endif
3218     *previous == OP_REF)
3219     {
3220     if (repeat_max == 0)
3221     {
3222     code = previous;
3223     goto END_REPEAT;
3224     }
3225    
3226     /* All real repeats make it impossible to handle partial matching (maybe
3227     one day we will be able to remove this restriction). */
3228    
3229     if (repeat_max != 1) cd->nopartial = TRUE;
3230    
3231     if (repeat_min == 0 && repeat_max == -1)
3232     *code++ = OP_CRSTAR + repeat_type;
3233     else if (repeat_min == 1 && repeat_max == -1)
3234     *code++ = OP_CRPLUS + repeat_type;
3235     else if (repeat_min == 0 && repeat_max == 1)
3236     *code++ = OP_CRQUERY + repeat_type;
3237     else
3238     {
3239     *code++ = OP_CRRANGE + repeat_type;
3240     PUT2INC(code, 0, repeat_min);
3241     if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
3242     PUT2INC(code, 0, repeat_max);
3243     }
3244     }
3245    
3246     /* If previous was a bracket group, we may have to replicate it in certain
3247     cases. */
3248    
3249 nigel 93 else if (*previous == OP_BRA || *previous == OP_CBRA ||
3250     *previous == OP_ONCE || *previous == OP_COND)
3251 nigel 77 {
3252     register int i;
3253     int ketoffset = 0;
3254     int len = code - previous;
3255     uschar *bralink = NULL;
3256    
3257 nigel 93 /* Repeating a DEFINE group is pointless */
3258    
3259     if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3260     {
3261     *errorcodeptr = ERR55;
3262     goto FAILED;
3263     }
3264    
3265     /* This is a paranoid check to stop integer overflow later on */
3266    
3267     if (len > MAX_DUPLENGTH)
3268     {
3269     *errorcodeptr = ERR50;
3270     goto FAILED;
3271     }
3272    
3273 nigel 77 /* If the maximum repeat count is unlimited, find the end of the bracket
3274     by scanning through from the start, and compute the offset back to it
3275     from the current code pointer. There may be an OP_OPT setting following
3276     the final KET, so we can't find the end just by going back from the code
3277     pointer. */
3278    
3279     if (repeat_max == -1)
3280     {
3281     register uschar *ket = previous;
3282     do ket += GET(ket, 1); while (*ket != OP_KET);
3283     ketoffset = code - ket;
3284     }
3285    
3286     /* The case of a zero minimum is special because of the need to stick
3287     OP_BRAZERO in front of it, and because the group appears once in the
3288     data, whereas in other cases it appears the minimum number of times. For
3289     this reason, it is simplest to treat this case separately, as otherwise
3290     the code gets far too messy. There are several special subcases when the
3291     minimum is zero. */
3292    
3293     if (repeat_min == 0)
3294     {
3295     /* If the maximum is also zero, we just omit the group from the output
3296     altogether. */
3297    
3298     if (repeat_max == 0)
3299     {
3300     code = previous;
3301     goto END_REPEAT;
3302     }
3303    
3304     /* If the maximum is 1 or unlimited, we just have to stick in the
3305     BRAZERO and do no more at this point. However, we do need to adjust
3306     any OP_RECURSE calls inside the group that refer to the group itself or
3307 nigel 93 any internal or forward referenced group, because the offset is from
3308     the start of the whole regex. Temporarily terminate the pattern while
3309     doing this. */
3310 nigel 77
3311     if (repeat_max <= 1)
3312     {
3313     *code = OP_END;
3314 nigel 93 adjust_recurse(previous, 1, utf8, cd, save_hwm);
3315 nigel 77 memmove(previous+1, previous, len);
3316     code++;
3317     *previous++ = OP_BRAZERO + repeat_type;
3318     }
3319    
3320     /* If the maximum is greater than 1 and limited, we have to replicate
3321     in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3322     The first one has to be handled carefully because it's the original
3323     copy, which has to be moved up. The remainder can be handled by code
3324     that is common with the non-zero minimum case below. We have to
3325     adjust the value of repeat_max, since one less copy is required. Once
3326     again, we may have to adjust any OP_RECURSE calls inside the group. */
3327    
3328     else
3329     {
3330     int offset;
3331     *code = OP_END;
3332 nigel 93 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3333 nigel 77 memmove(previous + 2 + LINK_SIZE, previous, len);
3334     code += 2 + LINK_SIZE;
3335     *previous++ = OP_BRAZERO + repeat_type;
3336     *previous++ = OP_BRA;
3337    
3338     /* We chain together the bracket offset fields that have to be
3339     filled in later when the ends of the brackets are reached. */
3340    
3341     offset = (bralink == NULL)? 0 : previous - bralink;
3342     bralink = previous;
3343     PUTINC(previous, 0, offset);
3344     }
3345    
3346     repeat_max--;
3347     }
3348    
3349     /* If the minimum is greater than zero, replicate the group as many
3350     times as necessary, and adjust the maximum to the number of subsequent
3351     copies that we need. If we set a first char from the group, and didn't
3352 nigel 93 set a required char, copy the latter from the former. If there are any
3353     forward reference subroutine calls in the group, there will be entries on
3354     the workspace list; replicate these with an appropriate increment. */
3355 nigel 77
3356     else
3357     {
3358     if (repeat_min > 1)
3359     {
3360 nigel 93 /* In the pre-compile phase, we don't actually do the replication. We
3361     just adjust the length as if we had. */
3362    
3363     if (lengthptr != NULL)
3364     *lengthptr += (repeat_min - 1)*length_prevgroup;
3365    
3366     /* This is compiling for real */
3367    
3368     else
3369 nigel 77 {
3370 nigel 93 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3371     for (i = 1; i < repeat_min; i++)
3372     {
3373     uschar *hc;
3374     uschar *this_hwm = cd->hwm;
3375     memcpy(code, previous, len);
3376     for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3377     {
3378     PUT(cd->hwm, 0, GET(hc, 0) + len);
3379     cd->hwm += LINK_SIZE;
3380     }
3381     save_hwm = this_hwm;
3382     code += len;
3383     }
3384 nigel 77 }
3385     }
3386 nigel 93
3387 nigel 77 if (repeat_max > 0) repeat_max -= repeat_min;
3388     }
3389    
3390     /* This code is common to both the zero and non-zero minimum cases. If
3391     the maximum is limited, it replicates the group in a nested fashion,
3392     remembering the bracket starts on a stack. In the case of a zero minimum,
3393     the first one was set up above. In all cases the repeat_max now specifies
3394 nigel 93 the number of additional copies needed. Again, we must remember to
3395     replicate entries on the forward reference list. */
3396 nigel 77
3397     if (repeat_max >= 0)
3398     {
3399 nigel 93 /* In the pre-compile phase, we don't actually do the replication. We
3400     just adjust the length as if we had. For each repetition we must add 1
3401     to the length for BRAZERO and for all but the last repetition we must
3402     add 2 + 2*LINK_SIZE to allow for the nesting that occurs. */
3403    
3404     if (lengthptr != NULL && repeat_max > 0)
3405     *lengthptr += repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3406     2 - 2*LINK_SIZE; /* Last one doesn't nest */
3407    
3408     /* This is compiling for real */
3409    
3410     else for (i = repeat_max - 1; i >= 0; i--)
3411 nigel 77 {
3412 nigel 93 uschar *hc;
3413     uschar *this_hwm = cd->hwm;
3414    
3415 nigel 77 *code++ = OP_BRAZERO + repeat_type;
3416    
3417     /* All but the final copy start a new nesting, maintaining the
3418     chain of brackets outstanding. */
3419    
3420     if (i != 0)
3421     {
3422     int offset;
3423     *code++ = OP_BRA;
3424     offset = (bralink == NULL)? 0 : code - bralink;
3425     bralink = code;
3426     PUTINC(code, 0, offset);
3427     }
3428    
3429     memcpy(code, previous, len);
3430 nigel 93 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3431     {
3432     PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3433     cd->hwm += LINK_SIZE;
3434     }
3435     save_hwm = this_hwm;
3436 nigel 77 code += len;
3437     }
3438    
3439     /* Now chain through the pending brackets, and fill in their length
3440     fields (which are holding the chain links pro tem). */
3441    
3442     while (bralink != NULL)
3443     {
3444     int oldlinkoffset;
3445     int offset = code - bralink + 1;
3446     uschar *bra = code - offset;
3447     oldlinkoffset = GET(bra, 1);
3448     bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
3449     *code++ = OP_KET;
3450     PUTINC(code, 0, offset);
3451     PUT(bra, 1, offset);
3452     }
3453     }
3454    
3455     /* If the maximum is unlimited, set a repeater in the final copy. We
3456     can't just offset backwards from the current code point, because we
3457     don't know if there's been an options resetting after the ket. The
3458 nigel 93 correct offset was computed above.
3459 nigel 77
3460 nigel 93 Then, when we are doing the actual compile phase, check to see whether
3461     this group is a non-atomic one that could match an empty string. If so,
3462     convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
3463     that runtime checking can be done. [This check is also applied to
3464     atomic groups at runtime, but in a different way.] */
3465    
3466     else
3467     {
3468     uschar *ketcode = code - ketoffset;
3469     uschar *bracode = ketcode - GET(ketcode, 1);
3470     *ketcode = OP_KETRMAX + repeat_type;
3471     if (lengthptr == NULL && *bracode != OP_ONCE)
3472     {
3473     uschar *scode = bracode;
3474     do
3475     {
3476     if (could_be_empty_branch(scode, ketcode, utf8))
3477     {
3478     *bracode += OP_SBRA - OP_BRA;
3479     break;
3480     }
3481     scode += GET(scode, 1);
3482     }
3483     while (*scode == OP_ALT);
3484     }
3485     }
3486 nigel 77 }
3487    
3488     /* Else there's some kind of shambles */
3489    
3490     else
3491     {
3492     *errorcodeptr = ERR11;
3493     goto FAILED;
3494     }
3495    
3496 nigel 93 /* If the character following a repeat is '+', or if certain optimization
3497     tests above succeeded, possessive_quantifier is TRUE. For some of the
3498     simpler opcodes, there is a special alternative opcode for this. For
3499     anything else, we wrap the entire repeated item inside OP_ONCE brackets.
3500     The '+' notation is just syntactic sugar, taken from Sun's Java package,
3501     but the special opcodes can optimize it a bit. The repeated item starts at
3502     tempcode, not at previous, which might be the first part of a string whose
3503     (former) last char we repeated.
3504 nigel 77
3505 nigel 93 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
3506     an 'upto' may follow. We skip over an 'exact' item, and then test the
3507     length of what remains before proceeding. */
3508    
3509 nigel 77 if (possessive_quantifier)
3510     {
3511 nigel 93 int len;
3512     if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
3513     *tempcode == OP_NOTEXACT)
3514     tempcode += _pcre_OP_lengths[*tempcode];
3515     len = code - tempcode;
3516     if (len > 0) switch (*tempcode)
3517     {
3518     case OP_STAR: *tempcode = OP_POSSTAR; break;
3519     case OP_PLUS: *tempcode = OP_POSPLUS; break;
3520     case OP_QUERY: *tempcode = OP_POSQUERY; break;
3521     case OP_UPTO: *tempcode = OP_POSUPTO; break;
3522    
3523     case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
3524     case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
3525     case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
3526     case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
3527    
3528     case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
3529     case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
3530     case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
3531     case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
3532    
3533     default:
3534     memmove(tempcode + 1+LINK_SIZE, tempcode, len);
3535     code += 1 + LINK_SIZE;
3536     len += 1 + LINK_SIZE;
3537     tempcode[0] = OP_ONCE;
3538     *code++ = OP_KET;
3539     PUTINC(code, 0, len);
3540     PUT(tempcode, 1, len);
3541     break;
3542     }
3543 nigel 77 }
3544    
3545     /* In all cases we no longer have a previous item. We also set the
3546     "follows varying string" flag for subsequently encountered reqbytes if
3547     it isn't already set and we have just passed a varying length item. */
3548    
3549     END_REPEAT:
3550     previous = NULL;
3551     cd->req_varyopt |= reqvary;
3552     break;
3553    
3554    
3555 nigel 93 /* ===================================================================*/
3556     /* Start of nested parenthesized sub-expression, or comment or lookahead or
3557     lookbehind or option setting or condition or all the other extended
3558     parenthesis forms. First deal with the specials; all are introduced by ?,
3559     and the appearance of any of them means that this is not a capturing
3560     group. */
3561 nigel 77
3562     case '(':
3563     newoptions = options;
3564     skipbytes = 0;
3565 nigel 93 bravalue = OP_CBRA;
3566     save_hwm = cd->hwm;
3567 nigel 77
3568     if (*(++ptr) == '?')
3569     {
3570 nigel 93 int i, set, unset, namelen;
3571 nigel 77 int *optset;
3572 nigel 93 const uschar *name;
3573     uschar *slot;
3574 nigel 77
3575     switch (*(++ptr))
3576     {
3577     case '#': /* Comment; skip to ket */
3578     ptr++;
3579 nigel 93 while (*ptr != 0 && *ptr != ')') ptr++;
3580     if (*ptr == 0)
3581     {
3582     *errorcodeptr = ERR18;
3583     goto FAILED;
3584     }
3585 nigel 77 continue;
3586    
3587 nigel 93
3588     /* ------------------------------------------------------------ */
3589     case ':': /* Non-capturing bracket */
3590 nigel 77 bravalue = OP_BRA;
3591     ptr++;
3592     break;
3593    
3594 nigel 93
3595     /* ------------------------------------------------------------ */
3596 nigel 77 case '(':
3597     bravalue = OP_COND; /* Conditional group */
3598    
3599 nigel 93 /* A condition can be an assertion, a number (referring to a numbered
3600     group), a name (referring to a named group), or 'R', referring to
3601     recursion. R<digits> and R&name are also permitted for recursion tests.
3602 nigel 77
3603 nigel 93 There are several syntaxes for testing a named group: (?(name)) is used
3604     by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
3605    
3606     There are two unfortunate ambiguities, caused by history. (a) 'R' can
3607     be the recursive thing or the name 'R' (and similarly for 'R' followed
3608     by digits), and (b) a number could be a name that consists of digits.
3609     In both cases, we look for a name first; if not found, we try the other
3610     cases. */
3611    
3612     /* For conditions that are assertions, check the syntax, and then exit
3613     the switch. This will take control down to where bracketed groups,
3614     including assertions, are processed. */
3615    
3616     if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
3617     break;
3618    
3619     /* Most other conditions use OP_CREF (a couple change to OP_RREF or
3620     OP_DEF below), and skip 3 bytes at the start of the group (1 for OP_DEF). */
3621    
3622     code[1+LINK_SIZE] = OP_CREF;
3623     skipbytes = 3;
3624    
3625     /* Check for a test for recursion in a named group. */
3626    
3627     if (ptr[1] == 'R' && ptr[2] == '&')
3628 nigel 77 {
3629 nigel 93 terminator = -1;
3630     ptr += 2;
3631     code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
3632     }
3633 nigel 91
3634 nigel 93 /* Check for a test for a named group's having been set, using the Perl
3635     syntax (?(<name>) or (?('name') */
3636 nigel 91
3637 nigel 93 else if (ptr[1] == '<')
3638     {
3639     terminator = '>';
3640     ptr++;
3641     }
3642     else if (ptr[1] == '\'')
3643     {
3644     terminator = '\'';
3645     ptr++;
3646     }
3647     else terminator = 0;
3648 nigel 77
3649 nigel 93 /* We now expect to read a name; anything else is an error */
3650 nigel 77
3651 nigel 93 if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
3652     {
3653     ptr += 1; /* To get the right offset */
3654     *errorcodeptr = ERR28;
3655     goto FAILED;
3656     }
3657    
3658     /* Read the name, but also get it as a number if it's all digits */
3659    
3660     recno = 0;
3661     name = ++ptr;
3662     while ((cd->ctypes[*ptr] & ctype_word) != 0)
3663     {
3664     if (recno >= 0)
3665     recno = ((digitab[*ptr] & ctype_digit) != 0)?
3666     recno * 10 + *ptr - '0' : -1;
3667 nigel 91 ptr++;
3668 nigel 93 }
3669     namelen = ptr - name;
3670 nigel 91
3671 nigel 93 if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
3672     {
3673     ptr--; /* Error offset */
3674     *errorcodeptr = ERR26;
3675     goto FAILED;
3676     }
3677 nigel 91
3678 nigel 93 /* Do no further checking in the pre-compile phase. */
3679 nigel 91
3680 nigel 93 if (lengthptr != NULL) break;
3681 nigel 91
3682 nigel 93 /* In the real compile we do the work of looking for the actual
3683     reference. */
3684 nigel 91
3685 nigel 93 slot = cd->name_table;
3686     for (i = 0; i < cd->names_found; i++)
3687     {
3688     if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
3689     slot += cd->name_entry_size;
3690     }
3691 nigel 91
3692 nigel 93 /* Found a previous named subpattern */
3693 nigel 91
3694 nigel 93 if (i < cd->names_found)
3695     {
3696     recno = GET2(slot, 0);
3697     PUT2(code, 2+LINK_SIZE, recno);
3698     }
3699 nigel 91
3700 nigel 93 /* Search the pattern for a forward reference */
3701 nigel 91
3702 nigel 93 else if ((i = find_parens(ptr, cd->bracount, name, namelen,
3703     (options & PCRE_EXTENDED) != 0)) > 0)
3704     {
3705     PUT2(code, 2+LINK_SIZE, i);
3706     }
3707 nigel 91
3708 nigel 93 /* If terminator == 0 it means that the name followed directly after
3709     the opening parenthesis [e.g. (?(abc)...] and in this case there are
3710     some further alternatives to try. For the cases where terminator != 0
3711     [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
3712     now checked all the possibilities, so give an error. */
3713 nigel 91
3714 nigel 93 else if (terminator != 0)
3715     {
3716     *errorcodeptr = ERR15;
3717     goto FAILED;
3718     }
3719    
3720     /* Check for (?(R) for recursion. Allow digits after R to specify a
3721     specific group number. */
3722    
3723     else if (*name == 'R')
3724     {
3725     recno = 0;
3726     for (i = 1; i < namelen; i++)
3727 nigel 91 {
3728 nigel 93 if ((digitab[name[i]] & ctype_digit) == 0)
3729     {
3730     *errorcodeptr = ERR15;
3731     goto FAILED;
3732     }
3733     recno = recno * 10 + name[i] - '0';
3734 nigel 77 }
3735 nigel 93 if (recno == 0) recno = RREF_ANY;
3736     code[1+LINK_SIZE] = OP_RREF; /* Change test type */
3737     PUT2(code, 2+LINK_SIZE, recno);
3738 nigel 77 }
3739 nigel 91
3740 nigel 93 /* Similarly, check for the (?(DEFINE) "condition", which is always
3741     false. */
3742 nigel 91
3743 nigel 93 else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
3744     {
3745     code[1+LINK_SIZE] = OP_DEF;
3746     skipbytes = 1;
3747     }
3748    
3749     /* Check for the "name" actually being a subpattern number. */
3750    
3751     else if (recno > 0)
3752     {
3753     PUT2(code, 2+LINK_SIZE, recno);
3754     }
3755    
3756     /* Either an unidentified subpattern, or a reference to (?(0) */
3757    
3758     else
3759     {
3760     *errorcodeptr = (recno == 0)? ERR35: ERR15;
3761     goto FAILED;
3762     }
3763 nigel 77 break;
3764    
3765 nigel 93
3766     /* ------------------------------------------------------------ */
3767 nigel 77 case '=': /* Positive lookahead */
3768     bravalue = OP_ASSERT;
3769     ptr++;
3770     break;
3771    
3772 nigel 93
3773     /* ------------------------------------------------------------ */
3774 nigel 77 case '!': /* Negative lookahead */
3775     bravalue = OP_ASSERT_NOT;
3776     ptr++;
3777     break;
3778    
3779 nigel 93
3780     /* ------------------------------------------------------------ */
3781     case '<': /* Lookbehind or named define */
3782     switch (ptr[1])
3783 nigel 77 {
3784     case '=': /* Positive lookbehind */
3785     bravalue = OP_ASSERTBACK;
3786 nigel 93 ptr += 2;
3787 nigel 77 break;
3788    
3789     case '!': /* Negative lookbehind */
3790     bravalue = OP_ASSERTBACK_NOT;
3791 nigel 93 ptr += 2;
3792 nigel 77 break;
3793 nigel 93
3794     default: /* Could be name define, else bad */
3795     if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
3796     ptr++; /* Correct offset for error */
3797     *errorcodeptr = ERR24;
3798     goto FAILED;
3799 nigel 77 }
3800     break;
3801    
3802 nigel 93
3803     /* ------------------------------------------------------------ */
3804 nigel 77 case '>': /* One-time brackets */
3805     bravalue = OP_ONCE;
3806     ptr++;
3807     break;
3808    
3809 nigel 93
3810     /* ------------------------------------------------------------ */
3811 nigel 77 case 'C': /* Callout - may be followed by digits; */
3812     previous_callout = code; /* Save for later completion */
3813     after_manual_callout = 1; /* Skip one item before completing */
3814 nigel 93 *code++ = OP_CALLOUT;
3815     {
3816 nigel 77 int n = 0;
3817     while ((digitab[*(++ptr)] & ctype_digit) != 0)
3818     n = n * 10 + *ptr - '0';
3819 nigel 93 if (*ptr != ')')
3820     {
3821     *errorcodeptr = ERR39;
3822     goto FAILED;
3823     }
3824 nigel 77 if (n > 255)
3825     {
3826     *errorcodeptr = ERR38;
3827     goto FAILED;
3828     }
3829     *code++ = n;
3830     PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
3831     PUT(code, LINK_SIZE, 0); /* Default length */
3832     code += 2 * LINK_SIZE;
3833     }
3834     previous = NULL;
3835     continue;
3836    
3837 nigel 93
3838     /* ------------------------------------------------------------ */
3839     case 'P': /* Python-style named subpattern handling */
3840     if (*(++ptr) == '=' || *ptr == '>') /* Reference or recursion */
3841 nigel 77 {
3842 nigel 93 is_recurse = *ptr == '>';
3843     terminator = ')';
3844     goto NAMED_REF_OR_RECURSE;
3845     }
3846     else if (*ptr != '<') /* Test for Python-style definition */
3847     {
3848     *errorcodeptr = ERR41;
3849     goto FAILED;
3850     }
3851     /* Fall through to handle (?P< as (?< is handled */
3852 nigel 77
3853    
3854 nigel 93 /* ------------------------------------------------------------ */
3855     DEFINE_NAME: /* Come here from (?< handling */
3856     case '\'':
3857     {
3858     terminator = (*ptr == '<')? '>' : '\'';
3859     name = ++ptr;
3860    
3861     while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
3862     namelen = ptr - name;
3863    
3864     /* In the pre-compile phase, just do a syntax check. */
3865    
3866     if (lengthptr != NULL)
3867 nigel 77 {
3868 nigel 93 if (*ptr != terminator)
3869 nigel 77 {
3870 nigel 93 *errorcodeptr = ERR42;
3871     goto FAILED;
3872     }
3873     if (cd->names_found >= MAX_NAME_COUNT)
3874     {
3875     *errorcodeptr = ERR49;
3876     goto FAILED;
3877     }
3878     if (namelen + 3 > cd->name_entry_size)
3879     {
3880     cd->name_entry_size = namelen + 3;
3881     if (namelen > MAX_NAME_SIZE)
3882 nigel 77 {
3883 nigel 93 *errorcodeptr = ERR48;
3884     goto FAILED;
3885     }
3886     }
3887     }
3888    
3889     /* In the real compile, create the entry in the table */
3890    
3891     else
3892     {
3893     slot = cd->name_table;
3894     for (i = 0; i < cd->names_found; i++)
3895     {
3896     int crc = memcmp(name, slot+2, namelen);
3897     if (crc == 0)
3898     {
3899     if (slot[2+namelen] == 0)
3900 nigel 91 {
3901 nigel 93 if ((options & PCRE_DUPNAMES) == 0)
3902     {
3903     *errorcodeptr = ERR43;
3904     goto FAILED;
3905     }
3906 nigel 91 }
3907 nigel 93 else crc = -1; /* Current name is substring */
3908 nigel 77 }
3909 nigel 93 if (crc < 0)
3910     {
3911     memmove(slot + cd->name_entry_size, slot,
3912     (cd->names_found - i) * cd->name_entry_size);
3913     break;
3914     }
3915     slot += cd->name_entry_size;
3916 nigel 77 }
3917 nigel 93
3918     PUT2(slot, 0, cd->bracount + 1);
3919     memcpy(slot + 2, name, namelen);
3920     slot[2+namelen] = 0;
3921 nigel 77 }
3922     }
3923    
3924 nigel 93 /* In both cases, count the number of names we've encountered. */
3925    
3926     ptr++; /* Move past > or ' */
3927     cd->names_found++;
3928     goto NUMBERED_GROUP;
3929    
3930    
3931     /* ------------------------------------------------------------ */
3932     case '&': /* Perl recursion/subroutine syntax */
3933     terminator = ')';
3934     is_recurse = TRUE;
3935     /* Fall through */
3936    
3937     /* We come here from the Python syntax above that handles both
3938     references (?P=name) and recursion (?P>name), as well as falling
3939     through from the Perl recursion syntax (?&name). */
3940    
3941     NAMED_REF_OR_RECURSE:
3942     name = ++ptr;
3943     while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
3944     namelen = ptr - name;
3945    
3946     /* In the pre-compile phase, do a syntax check and set a dummy
3947     reference number. */
3948    
3949     if (lengthptr != NULL)
3950 nigel 77 {
3951 nigel 93 if (*ptr != terminator)
3952     {
3953     *errorcodeptr = ERR42;
3954     goto FAILED;
3955     }
3956     if (namelen > MAX_NAME_SIZE)
3957     {
3958     *errorcodeptr = ERR48;
3959     goto FAILED;
3960     }
3961     recno = 0;
3962     }
3963 nigel 77
3964 nigel 93 /* In the real compile, seek the name in the table */
3965 nigel 77
3966 nigel 93 else
3967     {
3968     slot = cd->name_table;
3969 nigel 77 for (i = 0; i < cd->names_found; i++)
3970     {
3971     if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
3972     slot += cd->name_entry_size;
3973     }
3974 nigel 91
3975     if (i < cd->names_found) /* Back reference */
3976 nigel 77 {
3977 nigel 91 recno = GET2(slot, 0);
3978     }
3979     else if ((recno = /* Forward back reference */
3980 nigel 93 find_parens(ptr, cd->bracount, name, namelen,
3981     (options & PCRE_EXTENDED) != 0)) <= 0)
3982 nigel 91 {
3983 nigel 77 *errorcodeptr = ERR15;
3984     goto FAILED;
3985     }
3986 nigel 93 }
3987 nigel 77
3988 nigel 93 /* In both phases, we can now go to the code that handles numerical
3989     recursion or backreferences. */
3990 nigel 77
3991 nigel 93 if (is_recurse) goto HANDLE_RECURSION;
3992     else goto HANDLE_REFERENCE;
3993 nigel 77
3994    
3995 nigel 93 /* ------------------------------------------------------------ */
3996     case 'R': /* Recursion */
3997 nigel 77 ptr++; /* Same as (?0) */
3998     /* Fall through */
3999    
4000    
4001 nigel 93 /* ------------------------------------------------------------ */
4002     case '0': case '1': case '2': case '3': case '4': /* Recursion or */
4003     case '5': case '6': case '7': case '8': case '9': /* subroutine */
4004 nigel 77 {
4005     const uschar *called;
4006     recno = 0;
4007     while((digitab[*ptr] & ctype_digit) != 0)
4008     recno = recno * 10 + *ptr++ - '0';
4009 nigel 93 if (*ptr != ')')
4010     {
4011     *errorcodeptr = ERR29;
4012     goto FAILED;
4013     }
4014 nigel 77
4015     /* Come here from code above that handles a named recursion */
4016    
4017     HANDLE_RECURSION:
4018    
4019     previous = code;
4020 nigel 93 called = cd->start_code;
4021 nigel 77
4022 nigel 93 /* When we are actually compiling, find the bracket that is being
4023     referenced. Temporarily end the regex in case it doesn't exist before
4024     this point. If we end up with a forward reference, first check that
4025     the bracket does occur later so we can give the error (and position)
4026     now. Then remember this forward reference in the workspace so it can
4027     be filled in at the end. */
4028 nigel 77
4029 nigel 93 if (lengthptr == NULL)
4030 nigel 77 {
4031 nigel 93 *code = OP_END;
4032     if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
4033 nigel 77
4034 nigel 93 /* Forward reference */
4035 nigel 77
4036 nigel 93 if (called == NULL)
4037     {
4038     if (find_parens(ptr, cd->bracount, NULL, recno,
4039     (options & PCRE_EXTENDED) != 0) < 0)
4040     {
4041     *errorcodeptr = ERR15;
4042     goto FAILED;
4043     }
4044     called = cd->start_code + recno;
4045     PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
4046     }
4047    
4048     /* If not a forward reference, and the subpattern is still open,
4049     this is a recursive call. We check to see if this is a left
4050     recursion that could loop for ever, and diagnose that case. */
4051    
4052     else if (GET(called, 1) == 0 &&
4053     could_be_empty(called, code, bcptr, utf8))
4054     {
4055     *errorcodeptr = ERR40;
4056     goto FAILED;
4057     }
4058 nigel 77 }
4059    
4060 nigel 87 /* Insert the recursion/subroutine item, automatically wrapped inside
4061 nigel 93 "once" brackets. Set up a "previous group" length so that a
4062     subsequent quantifier will work. */
4063 nigel 77
4064 nigel 87 *code = OP_ONCE;
4065     PUT(code, 1, 2 + 2*LINK_SIZE);
4066     code += 1 + LINK_SIZE;
4067    
4068 nigel 77 *code = OP_RECURSE;
4069     PUT(code, 1, called - cd->start_code);
4070     code += 1 + LINK_SIZE;
4071 nigel 87
4072     *code = OP_KET;
4073     PUT(code, 1, 2 + 2*LINK_SIZE);
4074     code += 1 + LINK_SIZE;
4075 nigel 93
4076     length_prevgroup = 3 + 3*LINK_SIZE;
4077 nigel 77 }
4078 nigel 93
4079     /* Can't determine a first byte now */
4080    
4081     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4082 nigel 77 continue;
4083    
4084    
4085 nigel 93 /* ------------------------------------------------------------ */
4086     default: /* Other characters: check option setting */
4087 nigel 77 set = unset = 0;
4088     optset = &set;
4089    
4090     while (*ptr != ')' && *ptr != ':')
4091     {
4092     switch (*ptr++)
4093     {
4094     case '-': optset = &unset; break;
4095    
4096 nigel 93 case 'J': /* Record that it changed in the external options */
4097     *optset |= PCRE_DUPNAMES;
4098     cd->external_options |= PCRE_JCHANGED;
4099     break;
4100    
4101 nigel 77 case 'i': *optset |= PCRE_CASELESS; break;
4102     case 'm': *optset |= PCRE_MULTILINE; break;
4103     case 's': *optset |= PCRE_DOTALL; break;
4104     case 'x': *optset |= PCRE_EXTENDED; break;
4105     case 'U': *optset |= PCRE_UNGREEDY; break;
4106     case 'X': *optset |= PCRE_EXTRA; break;
4107 nigel 93
4108     default: *errorcodeptr = ERR12;
4109     ptr--; /* Correct the offset */
4110     goto FAILED;
4111 nigel 77 }
4112     }
4113    
4114     /* Set up the changed option bits, but don't change anything yet. */
4115    
4116     newoptions = (options | set) & (~unset);
4117    
4118     /* If the options ended with ')' this is not the start of a nested
4119 nigel 93 group with option changes, so the options change at this level. If this
4120     item is right at the start of the pattern, the options can be
4121     abstracted and made external in the pre-compile phase, and ignored in
4122     the compile phase. This can be helpful when matching -- for instance in
4123     caseless checking of required bytes.
4124 nigel 77
4125 nigel 93 If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
4126     definitely *not* at the start of the pattern because something has been
4127     compiled. In the pre-compile phase, however, the code pointer can have
4128     that value after the start, because it gets reset as code is discarded
4129     during the pre-compile. However, this can happen only at top level - if
4130     we are within parentheses, the starting BRA will still be present. At
4131     any parenthesis level, the length value can be used to test if anything
4132     has been compiled at that level. Thus, a test for both these conditions
4133     is necessary to ensure we correctly detect the start of the pattern in
4134     both phases.
4135 nigel 77
4136 nigel 93 If we are not at the pattern start, compile code to change the ims
4137     options if this setting actually changes any of them. We also pass the
4138     new setting back so that it can be put at the start of any following
4139     branches, and when this group ends (if we are in a group), a resetting
4140     item can be compiled. */
4141    
4142 nigel 77 if (*ptr == ')')
4143     {
4144 nigel 93 if (code == cd->start_code + 1 + LINK_SIZE &&
4145     (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
4146 nigel 77 {
4147 nigel 93 cd->external_options = newoptions;
4148     options = newoptions;
4149 nigel 77 }
4150 nigel 93 else
4151     {
4152     if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
4153     {
4154     *code++ = OP_OPT;
4155     *code++ = newoptions & PCRE_IMS;
4156     }
4157 nigel 77
4158 nigel 93 /* Change options at this level, and pass them back for use
4159     in subsequent branches. Reset the greedy defa