/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 171 - (hide annotations) (download)
Mon Jun 4 14:28:58 2007 UTC (7 years, 4 months ago) by ph10
File MIME type: text/plain
File size: 175192 byte(s)
Support \k{name} and \g{name} a la Perl 5.10.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 117 Copyright (c) 1997-2007 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains the external function pcre_compile(), along with
42     supporting internal functions that are not used by other modules. */
43    
44    
/* These three names are defined ahead of the pcre_internal.h include that
follows. NOTE(review): presumably macros in that header expand to uses of
NLBLOCK, PSSTART and PSEND, which is why they must exist first - confirm against
pcre_internal.h. */
45 nigel 93 #define NLBLOCK cd /* Block containing newline information */
46     #define PSSTART start_pattern /* Field containing processed string start */
47     #define PSEND end_pattern /* Field containing processed string end */
48    
49    
#include "pcre_internal.h"

#include <limits.h>   /* INT_MAX, for the overflow checks in check_escape() */
51    
52    
53 nigel 85 /* When DEBUG is defined, we need the pcre_printint() function, which is also
54     used by pcretest. DEBUG is not defined when building a production library. */
55    
56     #ifdef DEBUG
57     #include "pcre_printint.src"
58     #endif
59    
60    
61 nigel 77 /*************************************************
62     * Code parameters and static tables *
63     *************************************************/
64    
65 nigel 93 /* This value specifies the size of stack workspace that is used during the
66     first pre-compile phase that determines how much memory is required. The regex
67     is partly compiled into this space, but the compiled parts are discarded as
68     soon as they can be, so that hopefully there will never be an overrun. The code
69     does, however, check for an overrun. The largest amount I've seen used is 218,
70     so this number is very generous.
71 nigel 77
72 nigel 93 The same workspace is used during the second, actual compile phase for
73     remembering forward references to groups so that they can be filled in at the
74     end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
75     is 4 there is plenty of room. */
76 nigel 77
/* NOTE(review): the unit is not stated at this point; presumably bytes, since
the list entries are described as LINK_SIZE bytes each - confirm at the point
of use. */
77 nigel 93 #define COMPILE_WORK_SIZE (4096)
78 nigel 77
79 nigel 93
80 nigel 77 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
81     are simple data values; negative values are for special things like \d and so
82     on. Zero means further processing is needed (for things like \x), or the escape
83     is invalid. */
84    
85 ph10 97 #ifndef EBCDIC /* This is the "normal" table for ASCII systems */
86 nigel 77 static const short int escapes[] = {
87     0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
88     0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
89     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
90 ph10 168 0, 0, 0, -ESC_K, 0, 0, 0, 0, /* H - O */
91 nigel 93 -ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
92 nigel 77 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
93     '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
94 nigel 93 0, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */
95 nigel 77 -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, 0, -ESC_w, /* p - w */
96     0, 0, -ESC_z /* x - z */
97     };
98    
99 ph10 97 #else /* This is the "abnormal" table for EBCDIC systems */
100 nigel 77 static const short int escapes[] = {
101     /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
102     /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
103     /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
104     /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
105     /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
106     /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
107     /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
108     /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
109     /* 88 */ 0, 0, 0, '{', 0, 0, 0, 0,
110 nigel 93 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
111 nigel 77 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
112     /* A0 */ 0, '~', -ESC_s, ESC_tee, 0, 0, -ESC_w, 0,
113     /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
114     /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
115     /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
116     /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
117     /* C8 */ 0, 0, 0, 0, 0, 0, 0, 0,
118     /* D0 */ '}', 0, 0, 0, 0, 0, 0, -ESC_P,
119 nigel 93 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
120 nigel 77 /* E0 */ '\\', 0, -ESC_S, 0, 0, 0, -ESC_W, -ESC_X,
121     /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
122     /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
123     /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
124     };
125     #endif
126    
127    
128     /* Tables of names of POSIX character classes and their lengths. The list is
129 nigel 87 terminated by a zero length entry. The first three must be alpha, lower, upper,
130 nigel 77 as this is assumed for handling case independence. */
131    
/* Names of the POSIX character classes. The order matters: "alpha", "lower"
and "upper" must remain the first three entries. */
static const char *const posix_names[] = {
  "alpha",
  "lower",
  "upper",
  "alnum",
  "ascii",
  "blank",
  "cntrl",
  "digit",
  "graph",
  "print",
  "punct",
  "space",
  "word",
  "xdigit"
};
136    
137     static const uschar posix_name_lengths[] = {
138     5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
139    
140 nigel 87 /* Table of class bit maps for each POSIX class. Each class is formed from a
141     base map, with an optional addition or removal of another map. Then, for some
142     classes, there is some additional tweaking: for [:blank:] the vertical space
143     characters are removed, and for [:alpha:] and [:alnum:] the underscore
144     character is removed. The triples in the table consist of the base map offset,
145     second map offset or -1 if no second map, and a non-negative value for map
146     addition or a negative value for map subtraction (if there are two maps). The
147     absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
148     remove vertical space characters, 2 => remove underscore. */
149 nigel 77
150     static const int posix_class_maps[] = {
151 nigel 87 cbit_word, cbit_digit, -2, /* alpha */
152     cbit_lower, -1, 0, /* lower */
153     cbit_upper, -1, 0, /* upper */
154     cbit_word, -1, 2, /* alnum - word without underscore */
155     cbit_print, cbit_cntrl, 0, /* ascii */
156     cbit_space, -1, 1, /* blank - a GNU extension */
157     cbit_cntrl, -1, 0, /* cntrl */
158     cbit_digit, -1, 0, /* digit */
159     cbit_graph, -1, 0, /* graph */
160     cbit_print, -1, 0, /* print */
161     cbit_punct, -1, 0, /* punct */
162     cbit_space, -1, 0, /* space */
163     cbit_word, -1, 0, /* word - a Perl extension */
164     cbit_xdigit,-1, 0 /* xdigit */
165 nigel 77 };
166    
167    
/* Two-level stringification: STRING(a) turns its argument into a string
literal as written, while XSTRING(s) lets s be macro-expanded first and then
stringifies the result. XSTRING is what the error text table uses to embed the
values of MAX_NAME_SIZE and MAX_NAME_COUNT in messages. */
168 nigel 93 #define STRING(a) # a
169     #define XSTRING(s) STRING(s)
170    
171 nigel 77 /* The texts of compile-time error messages. These are "char *" because they
172 nigel 93 are passed to the outside world. Do not ever re-use any error number, because
173     they are documented. Always add a new error instead. Messages marked DEAD below
174     are no longer used. */
175 nigel 77
176     static const char *error_texts[] = {
177     "no error",
178     "\\ at end of pattern",
179     "\\c at end of pattern",
180     "unrecognized character follows \\",
181     "numbers out of order in {} quantifier",
182     /* 5 */
183     "number too big in {} quantifier",
184     "missing terminating ] for character class",
185     "invalid escape sequence in character class",
186     "range out of order in character class",
187     "nothing to repeat",
188     /* 10 */
189 nigel 93 "operand of unlimited repeat could match the empty string", /** DEAD **/
190 nigel 77 "internal error: unexpected repeat",
191     "unrecognized character after (?",
192     "POSIX named classes are supported only within a class",
193     "missing )",
194     /* 15 */
195     "reference to non-existent subpattern",
196     "erroffset passed as NULL",
197     "unknown option bit(s) set",
198     "missing ) after comment",
199 nigel 93 "parentheses nested too deeply", /** DEAD **/
200 nigel 77 /* 20 */
201     "regular expression too large",
202     "failed to get memory",
203     "unmatched parentheses",
204     "internal error: code overflow",
205     "unrecognized character after (?<",
206     /* 25 */
207     "lookbehind assertion is not fixed length",
208 nigel 91 "malformed number or name after (?(",
209 nigel 77 "conditional group contains more than two branches",
210     "assertion expected after (?(",
211 ph10 166 "(?R or (?[+-]digits must be followed by )",
212 nigel 77 /* 30 */
213     "unknown POSIX class name",
214     "POSIX collating elements are not supported",
215     "this version of PCRE is not compiled with PCRE_UTF8 support",
216 nigel 93 "spare error", /** DEAD **/
217 nigel 77 "character value in \\x{...} sequence is too large",
218     /* 35 */
219     "invalid condition (?(0)",
220     "\\C not allowed in lookbehind assertion",
221     "PCRE does not support \\L, \\l, \\N, \\U, or \\u",
222     "number after (?C is > 255",
223     "closing ) for (?C expected",
224     /* 40 */
225     "recursive call could loop indefinitely",
226     "unrecognized character after (?P",
227 nigel 93 "syntax error in subpattern name (missing terminator)",
228 nigel 91 "two named subpatterns have the same name",
229 nigel 77 "invalid UTF-8 string",
230     /* 45 */
231     "support for \\P, \\p, and \\X has not been compiled",
232     "malformed \\P or \\p sequence",
233 nigel 91 "unknown property name after \\P or \\p",
234 nigel 93 "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)",
235     "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")",
236 nigel 91 /* 50 */
237     "repeated subpattern is too long",
238 nigel 93 "octal value is greater than \\377 (not in UTF-8 mode)",
239     "internal error: overran compiling workspace",
240     "internal error: previously-checked referenced subpattern not found",
241     "DEFINE group contains more than one branch",
242     /* 55 */
243     "repeating a DEFINE group is not allowed",
244     "inconsistent NEWLINE options",
245 ph10 171 "\\g is not followed by a braced name or an optionally braced non-zero number",
246 ph10 167 "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number"
247 nigel 77 };
248    
249    
250     /* Table to identify digits and hex digits. This is used when compiling
251     patterns. Note that the tables in chartables are dependent on the locale, and
252     may mark arbitrary characters as digits - but the PCRE compiling code expects
253     to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
254     a private table here. It costs 256 bytes, but it is a lot faster than doing
255     character value tests (at least in some simple cases I timed), and in some
256     applications one wants PCRE to compile efficiently as well as match
257     efficiently.
258    
259     For convenience, we use the same bit definitions as in chartables:
260    
261     0x04 decimal digit
262     0x08 hexadecimal digit
263    
264     Then we can use ctype_digit and ctype_xdigit in the code. */
265    
266 ph10 97 #ifndef EBCDIC /* This is the "normal" case, for ASCII systems */
/* ASCII digit table, 256 entries laid out 16 per row; the comment at the start
of each row is the code of the row's first entry. Bit 0x04 marks a decimal
digit and bit 0x08 a hexadecimal digit, so '0'-'9' hold 0x0c and the letters
a-f / A-F hold 0x08. */
static const unsigned char digitab[] =
  {
  /* 00 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 10 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 20 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 30 */ 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 40 */ 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 50 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 60 */ 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 70 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 80 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 90 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* A0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* B0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* C0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* D0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* E0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* F0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
  };
301    
302 ph10 97 #else /* This is the "abnormal" case, for EBCDIC systems */
/* EBCDIC digit table, 256 entries laid out 16 per row; the comment at the
start of each row is the code of the row's first entry. Bit 0x04 marks a
decimal digit and bit 0x08 a hexadecimal digit: codes F0-F9 hold 0x0c, and
codes 81-86 and C1-C6 hold 0x08. */
static const unsigned char digitab[] =
  {
  /* 00 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 10 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 20 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 30 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 40 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 50 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 60 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 70 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 80 */ 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 90 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* A0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* B0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* C0 */ 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* D0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* E0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* F0 */ 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00
  };
337    
/* Partial duplicate of the character-type table for EBCDIC, 256 entries laid
out 8 per row; the comment at the start of each row is the code of the row's
first entry. */
static const unsigned char ebcdic_chartab[] = {
  /* 00 */ 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00,
  /* 08 */ 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00,
  /* 10 */ 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,
  /* 18 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 20 */ 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,
  /* 28 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 30 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 38 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 40 */ 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 48 */ 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80,
  /* 50 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 58 */ 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00,
  /* 60 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 68 */ 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80,
  /* 70 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 78 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 80 */ 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12,
  /* 88 */ 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 90 */ 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12,
  /* 98 */ 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* A0 */ 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12,
  /* A8 */ 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* B0 */ 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* B8 */ 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00,
  /* C0 */ 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12,
  /* C8 */ 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* D0 */ 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12,
  /* D8 */ 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* E0 */ 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12,
  /* E8 */ 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* F0 */ 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,
  /* F8 */ 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00
};
371     #endif
372    
373    
374     /* Definition to allow mutual recursion */
375    
/* Forward declaration only: compile_regex() is static, so its definition must
appear later in this file (it is not in the part shown here). Declaring it up
front lets functions defined before it call it. */
376     static BOOL
377 nigel 93 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, int, int *,
378     int *, branch_chain *, compile_data *, int *);
379 nigel 77
380    
381    
382     /*************************************************
383     * Handle escapes *
384     *************************************************/
385    
386     /* This function is called when a \ has been encountered. It either returns a
387     positive value for a simple escape such as \n, or a negative value which
388 nigel 93 encodes one of the more complicated things such as \d. A backreference to group
389     n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
390     UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
391     ptr is pointing at the \. On exit, it is on the final character of the escape
392     sequence.
393 nigel 77
394     Arguments:
395     ptrptr points to the pattern position pointer
396     errorcodeptr points to the errorcode variable
397     bracount number of previous extracting brackets
398     options the options bits
399     isclass TRUE if inside a character class
400    
401     Returns: zero or positive => a data character
402     negative => a special escape sequence
403     on error, errorptr is set
404     */
405    
406     static int
407     check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
408     int options, BOOL isclass)
409     {
410 nigel 87 BOOL utf8 = (options & PCRE_UTF8) != 0;
411     const uschar *ptr = *ptrptr + 1;
412 nigel 77 int c, i;
413    
414 nigel 87 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
415     ptr--; /* Set pointer back to the last byte */
416    
417 nigel 77 /* If backslash is at the end of the pattern, it's an error. */
418    
419     if (c == 0) *errorcodeptr = ERR1;
420    
421     /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
422     a table. A non-zero result is something that can be returned immediately.
423     Otherwise further processing may be required. */
424    
425 ph10 97 #ifndef EBCDIC /* ASCII coding */
426 nigel 77 else if (c < '0' || c > 'z') {} /* Not alphameric */
427     else if ((i = escapes[c - '0']) != 0) c = i;
428    
429 ph10 97 #else /* EBCDIC coding */
430 nigel 77 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
431     else if ((i = escapes[c - 0x48]) != 0) c = i;
432     #endif
433    
434     /* Escapes that need further processing, or are illegal. */
435    
436     else
437     {
438     const uschar *oldptr;
439 nigel 93 BOOL braced, negated;
440    
441 nigel 77 switch (c)
442     {
443     /* A number of Perl escapes are not handled by PCRE. We give an explicit
444     error. */
445    
446     case 'l':
447     case 'L':
448     case 'N':
449     case 'u':
450     case 'U':
451     *errorcodeptr = ERR37;
452     break;
453    
454 nigel 93 /* \g must be followed by a number, either plain or braced. If positive, it
455     is an absolute backreference. If negative, it is a relative backreference.
456 ph10 171 This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a
457     reference to a named group. This is part of Perl's movement towards a
458     unified syntax for back references. As this is synonymous with \k{name}, we
459     fudge it up by pretending it really was \k. */
460 nigel 93
461     case 'g':
462     if (ptr[1] == '{')
463     {
464 ph10 171 const uschar *p;
465     for (p = ptr+2; *p != 0 && *p != '}'; p++)
466     if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
467     if (*p != 0 && *p != '}')
468     {
469     c = -ESC_k;
470     break;
471     }
472 nigel 93 braced = TRUE;
473     ptr++;
474     }
475     else braced = FALSE;
476    
477     if (ptr[1] == '-')
478     {
479     negated = TRUE;
480     ptr++;
481     }
482     else negated = FALSE;
483    
484     c = 0;
485     while ((digitab[ptr[1]] & ctype_digit) != 0)
486     c = c * 10 + *(++ptr) - '0';
487    
488     if (c == 0 || (braced && *(++ptr) != '}'))
489     {
490     *errorcodeptr = ERR57;
491     return 0;
492     }
493    
494     if (negated)
495     {
496     if (c > bracount)
497     {
498     *errorcodeptr = ERR15;
499     return 0;
500     }
501     c = bracount - (c - 1);
502     }
503    
504     c = -(ESC_REF + c);
505     break;
506    
507 nigel 77 /* The handling of escape sequences consisting of a string of digits
508     starting with one that is not zero is not straightforward. By experiment,
509     the way Perl works seems to be as follows:
510    
511     Outside a character class, the digits are read as a decimal number. If the
512     number is less than 10, or if there are that many previous extracting
513     left brackets, then it is a back reference. Otherwise, up to three octal
514     digits are read to form an escaped byte. Thus \123 is likely to be octal
515     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
516     value is greater than 377, the least significant 8 bits are taken. Inside a
517     character class, \ followed by a digit is always an octal number. */
518    
519     case '1': case '2': case '3': case '4': case '5':
520     case '6': case '7': case '8': case '9':
521    
522     if (!isclass)
523     {
524     oldptr = ptr;
525     c -= '0';
526     while ((digitab[ptr[1]] & ctype_digit) != 0)
527     c = c * 10 + *(++ptr) - '0';
528     if (c < 10 || c <= bracount)
529     {
530     c = -(ESC_REF + c);
531     break;
532     }
533     ptr = oldptr; /* Put the pointer back and fall through */
534     }
535    
536     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
537     generates a binary zero byte and treats the digit as a following literal.
538     Thus we have to pull back the pointer by one. */
539    
540     if ((c = *ptr) >= '8')
541     {
542     ptr--;
543     c = 0;
544     break;
545     }
546    
547     /* \0 always starts an octal number, but we may drop through to here with a
548 nigel 91 larger first octal digit. The original code used just to take the least
549     significant 8 bits of octal numbers (I think this is what early Perls used
550     to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
551     than 3 octal digits. */
552 nigel 77
553     case '0':
554     c -= '0';
555     while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
556     c = c * 8 + *(++ptr) - '0';
557 nigel 91 if (!utf8 && c > 255) *errorcodeptr = ERR51;
558 nigel 77 break;
559    
560 nigel 87 /* \x is complicated. \x{ddd} is a character number which can be greater
561     than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
562     treated as a data character. */
563 nigel 77
564     case 'x':
565 nigel 87 if (ptr[1] == '{')
566 nigel 77 {
567     const uschar *pt = ptr + 2;
568 nigel 87 int count = 0;
569    
570 nigel 77 c = 0;
571     while ((digitab[*pt] & ctype_xdigit) != 0)
572     {
573 nigel 87 register int cc = *pt++;
574     if (c == 0 && cc == '0') continue; /* Leading zeroes */
575 nigel 77 count++;
576 nigel 87
577 ph10 97 #ifndef EBCDIC /* ASCII coding */
578 nigel 77 if (cc >= 'a') cc -= 32; /* Convert to upper case */
579 nigel 87 c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
580 ph10 97 #else /* EBCDIC coding */
581 nigel 77 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
582 nigel 87 c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
583 nigel 77 #endif
584     }
585 nigel 87
586 nigel 77 if (*pt == '}')
587     {
588 nigel 87 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
589 nigel 77 ptr = pt;
590     break;
591     }
592 nigel 87
593 nigel 77 /* If the sequence of hex digits does not end with '}', then we don't
594     recognize this construct; fall through to the normal \x handling. */
595     }
596    
597 nigel 87 /* Read just a single-byte hex-defined char */
598 nigel 77
599     c = 0;
600     while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
601     {
602     int cc; /* Some compilers don't like ++ */
603     cc = *(++ptr); /* in initializers */
604 ph10 97 #ifndef EBCDIC /* ASCII coding */
605 nigel 77 if (cc >= 'a') cc -= 32; /* Convert to upper case */
606     c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
607 ph10 97 #else /* EBCDIC coding */
608 nigel 77 if (cc <= 'z') cc += 64; /* Convert to upper case */
609     c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
610     #endif
611     }
612     break;
613    
614 nigel 93 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
615     This coding is ASCII-specific, but then the whole concept of \cx is
616     ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
617 nigel 77
618     case 'c':
619     c = *(++ptr);
620     if (c == 0)
621     {
622     *errorcodeptr = ERR2;
623     return 0;
624     }
625    
626 ph10 97 #ifndef EBCDIC /* ASCII coding */
627 nigel 77 if (c >= 'a' && c <= 'z') c -= 32;
628     c ^= 0x40;
629 ph10 97 #else /* EBCDIC coding */
630 nigel 77 if (c >= 'a' && c <= 'z') c += 64;
631     c ^= 0xC0;
632     #endif
633     break;
634    
635     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
636     other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
637     for Perl compatibility, it is a literal. This code looks a bit odd, but
638     there used to be some cases other than the default, and there may be again
639     in future, so I haven't "optimized" it. */
640    
641     default:
642     if ((options & PCRE_EXTRA) != 0) switch(c)
643     {
644     default:
645     *errorcodeptr = ERR3;
646     break;
647     }
648     break;
649     }
650     }
651    
652     *ptrptr = ptr;
653     return c;
654     }
655    
656    
657    
658     #ifdef SUPPORT_UCP
659     /*************************************************
660     * Handle \P and \p *
661     *************************************************/
662    
663     /* This function is called after \P or \p has been encountered, provided that
664     PCRE is compiled with support for Unicode properties. On entry, ptrptr is
665     pointing at the P or p. On exit, it is pointing at the final character of the
666     escape sequence.
667    
668     Argument:
669     ptrptr points to the pattern position pointer
670     negptr points to a boolean that is set TRUE for negation else FALSE
671 nigel 87 dptr points to an int that is set to the detailed property value
672 nigel 77 errorcodeptr points to the error code variable
673    
674 nigel 87 Returns: type value from ucp_type_table, or -1 for an invalid type
675 nigel 77 */
676    
677     static int
678 nigel 87 get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
679 nigel 77 {
680     int c, i, bot, top;
681     const uschar *ptr = *ptrptr;
682 nigel 87 char name[32];
683 nigel 77
684     c = *(++ptr);
685     if (c == 0) goto ERROR_RETURN;
686    
687     *negptr = FALSE;
688    
689 nigel 87 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
690     negation. */
691 nigel 77
692     if (c == '{')
693     {
694     if (ptr[1] == '^')
695     {
696     *negptr = TRUE;
697     ptr++;
698     }
699 nigel 87 for (i = 0; i < sizeof(name) - 1; i++)
700 nigel 77 {
701     c = *(++ptr);
702     if (c == 0) goto ERROR_RETURN;
703     if (c == '}') break;
704     name[i] = c;
705     }
706 nigel 87 if (c !='}') goto ERROR_RETURN;
707 nigel 77 name[i] = 0;
708     }
709    
710     /* Otherwise there is just one following character */
711    
712     else
713     {
714     name[0] = c;
715     name[1] = 0;
716     }
717    
718     *ptrptr = ptr;
719    
720     /* Search for a recognized property name using binary chop */
721    
722     bot = 0;
723     top = _pcre_utt_size;
724    
725     while (bot < top)
726     {
727 nigel 87 i = (bot + top) >> 1;
728 nigel 77 c = strcmp(name, _pcre_utt[i].name);
729 nigel 87 if (c == 0)
730     {
731     *dptr = _pcre_utt[i].value;
732     return _pcre_utt[i].type;
733     }
734 nigel 77 if (c > 0) bot = i + 1; else top = i;
735     }
736    
737     *errorcodeptr = ERR47;
738     *ptrptr = ptr;
739     return -1;
740    
741     ERROR_RETURN:
742     *errorcodeptr = ERR46;
743     *ptrptr = ptr;
744     return -1;
745     }
746     #endif
747    
748    
749    
750    
751     /*************************************************
752     * Check for counted repeat *
753     *************************************************/
754    
755     /* This function is called when a '{' is encountered in a place where it might
756     start a quantifier. It looks ahead to see if it really is a quantifier or not.
757     It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
758     where the ddds are digits.
759    
760     Arguments:
761     p pointer to the first char after '{'
762    
763     Returns: TRUE or FALSE
764     */
765    
766     static BOOL
767     is_counted_repeat(const uschar *p)
768     {
769     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
770     while ((digitab[*p] & ctype_digit) != 0) p++;
771     if (*p == '}') return TRUE;
772    
773     if (*p++ != ',') return FALSE;
774     if (*p == '}') return TRUE;
775    
776     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
777     while ((digitab[*p] & ctype_digit) != 0) p++;
778    
779     return (*p == '}');
780     }
781    
782    
783    
784     /*************************************************
785     * Read repeat counts *
786     *************************************************/
787    
788     /* Read an item of the form {n,m} and return the values. This is called only
789     after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
790     so the syntax is guaranteed to be correct, but we need to check the values.
791    
792     Arguments:
793     p pointer to first char after '{'
794     minp pointer to int for min
795     maxp pointer to int for max
796     returned as -1 if no max
797     errorcodeptr points to error code variable
798    
799     Returns: pointer to '}' on success;
800     current ptr on error, with errorcodeptr set non-zero
801     */
802    
803     static const uschar *
804     read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
805     {
806     int min = 0;
807     int max = -1;
808    
809 nigel 81 /* Read the minimum value and do a paranoid check: a negative value indicates
810     an integer overflow. */
811    
812 nigel 77 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
813 nigel 81 if (min < 0 || min > 65535)
814     {
815     *errorcodeptr = ERR5;
816     return p;
817     }
818 nigel 77
819 nigel 81 /* Read the maximum value if there is one, and again do a paranoid on its size.
820     Also, max must not be less than min. */
821    
822 nigel 77 if (*p == '}') max = min; else
823     {
824     if (*(++p) != '}')
825     {
826     max = 0;
827     while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
828 nigel 81 if (max < 0 || max > 65535)
829     {
830     *errorcodeptr = ERR5;
831     return p;
832     }
833 nigel 77 if (max < min)
834     {
835     *errorcodeptr = ERR4;
836     return p;
837     }
838     }
839     }
840    
841 nigel 81 /* Fill in the required variables, and pass back the pointer to the terminating
842     '}'. */
843 nigel 77
844 nigel 81 *minp = min;
845     *maxp = max;
846 nigel 77 return p;
847     }
848    
849    
850    
851     /*************************************************
852 nigel 93 * Find forward referenced subpattern *
853 nigel 91 *************************************************/
854    
855 nigel 93 /* This function scans along a pattern's text looking for capturing
856     subpatterns, and counting them. If it finds a named pattern that matches the
857     name it is given, it returns its number. Alternatively, if the name is NULL, it
858     returns when it reaches a given numbered subpattern. This is used for forward
859     references to subpatterns. We know that if (?P< is encountered, the name will
860     be terminated by '>' because that is checked in the first pass.
861 nigel 91
862     Arguments:
863 nigel 93 ptr current position in the pattern
864     count current count of capturing parens so far encountered
865     name name to seek, or NULL if seeking a numbered subpattern
866     lorn name length, or subpattern number if name is NULL
867     xmode TRUE if we are in /x mode
868 nigel 91
869     Returns: the number of the named subpattern, or -1 if not found
870     */
871    
872     static int
873 nigel 93 find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
874     BOOL xmode)
875 nigel 91 {
876     const uschar *thisname;
877 nigel 93
878 nigel 91 for (; *ptr != 0; ptr++)
879     {
880 nigel 93 int term;
881    
882     /* Skip over backslashed characters and also entire \Q...\E */
883    
884     if (*ptr == '\\')
885     {
886     if (*(++ptr) == 0) return -1;
887     if (*ptr == 'Q') for (;;)
888     {
889     while (*(++ptr) != 0 && *ptr != '\\');
890     if (*ptr == 0) return -1;
891     if (*(++ptr) == 'E') break;
892     }
893     continue;
894     }
895    
896     /* Skip over character classes */
897    
898     if (*ptr == '[')
899     {
900     while (*(++ptr) != ']')
901     {
902     if (*ptr == '\\')
903     {
904     if (*(++ptr) == 0) return -1;
905     if (*ptr == 'Q') for (;;)
906     {
907     while (*(++ptr) != 0 && *ptr != '\\');
908     if (*ptr == 0) return -1;
909     if (*(++ptr) == 'E') break;
910     }
911     continue;
912     }
913     }
914     continue;
915     }
916    
917     /* Skip comments in /x mode */
918    
919     if (xmode && *ptr == '#')
920     {
921     while (*(++ptr) != 0 && *ptr != '\n');
922     if (*ptr == 0) return -1;
923     continue;
924     }
925    
926     /* An opening parens must now be a real metacharacter */
927    
928 nigel 91 if (*ptr != '(') continue;
929 nigel 93 if (ptr[1] != '?')
930     {
931     count++;
932     if (name == NULL && count == lorn) return count;
933     continue;
934     }
935    
936     ptr += 2;
937     if (*ptr == 'P') ptr++; /* Allow optional P */
938    
939     /* We have to disambiguate (?<! and (?<= from (?<name> */
940    
941     if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
942     *ptr != '\'')
943     continue;
944    
945 nigel 91 count++;
946 nigel 93
947     if (name == NULL && count == lorn) return count;
948     term = *ptr++;
949     if (term == '<') term = '>';
950 nigel 91 thisname = ptr;
951 nigel 93 while (*ptr != term) ptr++;
952     if (name != NULL && lorn == ptr - thisname &&
953     strncmp((const char *)name, (const char *)thisname, lorn) == 0)
954 nigel 91 return count;
955     }
956 nigel 93
957 nigel 91 return -1;
958     }
959    
960    
961    
962     /*************************************************
963 nigel 77 * Find first significant op code *
964     *************************************************/
965    
966     /* This is called by several functions that scan a compiled expression looking
967     for a fixed first character, or an anchoring op code etc. It skips over things
968     that do not influence this. For some calls, a change of option is important.
969     For some calls, it makes sense to skip negative forward and all backward
970     assertions, and also the \b assertion; for others it does not.
971    
972     Arguments:
973     code pointer to the start of the group
974     options pointer to external options
975     optbit the option bit whose changing is significant, or
976     zero if none are
977     skipassert TRUE if certain assertions are to be skipped
978    
979     Returns: pointer to the first significant opcode
980     */
981    
982     static const uschar*
983     first_significant_code(const uschar *code, int *options, int optbit,
984     BOOL skipassert)
985     {
986     for (;;)
987     {
988     switch ((int)*code)
989     {
990     case OP_OPT:
991     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
992     *options = (int)code[1];
993     code += 2;
994     break;
995    
996     case OP_ASSERT_NOT:
997     case OP_ASSERTBACK:
998     case OP_ASSERTBACK_NOT:
999     if (!skipassert) return code;
1000     do code += GET(code, 1); while (*code == OP_ALT);
1001     code += _pcre_OP_lengths[*code];
1002     break;
1003    
1004     case OP_WORD_BOUNDARY:
1005     case OP_NOT_WORD_BOUNDARY:
1006     if (!skipassert) return code;
1007     /* Fall through */
1008    
1009     case OP_CALLOUT:
1010     case OP_CREF:
1011 nigel 93 case OP_RREF:
1012     case OP_DEF:
1013 nigel 77 code += _pcre_OP_lengths[*code];
1014     break;
1015    
1016     default:
1017     return code;
1018     }
1019     }
1020     /* Control never reaches here */
1021     }
1022    
1023    
1024    
1025    
1026     /*************************************************
1027     * Find the fixed length of a pattern *
1028     *************************************************/
1029    
1030     /* Scan a pattern and compute the fixed length of subject that will match it,
1031     if the length is fixed. This is needed for dealing with backward assertions.
1032     In UTF8 mode, the result is in characters rather than bytes.
1033    
1034     Arguments:
1035     code points to the start of the pattern (the bracket)
1036     options the compiling options
1037    
1038     Returns: the fixed length, or -1 if there is no fixed length,
1039     or -2 if \C was encountered
1040     */
1041    
1042     static int
1043     find_fixedlength(uschar *code, int options)
1044     {
1045     int length = -1;
1046    
1047     register int branchlength = 0;
1048     register uschar *cc = code + 1 + LINK_SIZE;
1049    
1050     /* Scan along the opcodes for this branch. If we get to the end of the
1051     branch, check the length against that of the other branches. */
1052    
1053     for (;;)
1054     {
1055     int d;
1056     register int op = *cc;
1057    
1058     switch (op)
1059     {
1060 nigel 93 case OP_CBRA:
1061 nigel 77 case OP_BRA:
1062     case OP_ONCE:
1063     case OP_COND:
1064 nigel 93 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1065 nigel 77 if (d < 0) return d;
1066     branchlength += d;
1067     do cc += GET(cc, 1); while (*cc == OP_ALT);
1068     cc += 1 + LINK_SIZE;
1069     break;
1070    
1071     /* Reached end of a branch; if it's a ket it is the end of a nested
1072     call. If it's ALT it is an alternation in a nested call. If it is
1073     END it's the end of the outer call. All can be handled by the same code. */
1074    
1075     case OP_ALT:
1076     case OP_KET:
1077     case OP_KETRMAX:
1078     case OP_KETRMIN:
1079     case OP_END:
1080     if (length < 0) length = branchlength;
1081     else if (length != branchlength) return -1;
1082     if (*cc != OP_ALT) return length;
1083     cc += 1 + LINK_SIZE;
1084     branchlength = 0;
1085     break;
1086    
1087     /* Skip over assertive subpatterns */
1088    
1089     case OP_ASSERT:
1090     case OP_ASSERT_NOT:
1091     case OP_ASSERTBACK:
1092     case OP_ASSERTBACK_NOT:
1093     do cc += GET(cc, 1); while (*cc == OP_ALT);
1094     /* Fall through */
1095    
1096     /* Skip over things that don't match chars */
1097    
1098     case OP_REVERSE:
1099     case OP_CREF:
1100 nigel 93 case OP_RREF:
1101     case OP_DEF:
1102 nigel 77 case OP_OPT:
1103     case OP_CALLOUT:
1104     case OP_SOD:
1105     case OP_SOM:
1106     case OP_EOD:
1107     case OP_EODN:
1108     case OP_CIRC:
1109     case OP_DOLL:
1110     case OP_NOT_WORD_BOUNDARY:
1111     case OP_WORD_BOUNDARY:
1112     cc += _pcre_OP_lengths[*cc];
1113     break;
1114    
1115     /* Handle literal characters */
1116    
1117     case OP_CHAR:
1118     case OP_CHARNC:
1119 nigel 91 case OP_NOT:
1120 nigel 77 branchlength++;
1121     cc += 2;
1122     #ifdef SUPPORT_UTF8
1123     if ((options & PCRE_UTF8) != 0)
1124     {
1125     while ((*cc & 0xc0) == 0x80) cc++;
1126     }
1127     #endif
1128     break;
1129    
1130     /* Handle exact repetitions. The count is already in characters, but we
1131     need to skip over a multibyte character in UTF8 mode. */
1132    
1133     case OP_EXACT:
1134     branchlength += GET2(cc,1);
1135     cc += 4;
1136     #ifdef SUPPORT_UTF8
1137     if ((options & PCRE_UTF8) != 0)
1138     {
1139     while((*cc & 0x80) == 0x80) cc++;
1140     }
1141     #endif
1142     break;
1143    
1144     case OP_TYPEEXACT:
1145     branchlength += GET2(cc,1);
1146     cc += 4;
1147     break;
1148    
1149     /* Handle single-char matchers */
1150    
1151     case OP_PROP:
1152     case OP_NOTPROP:
1153 nigel 87 cc += 2;
1154 nigel 77 /* Fall through */
1155    
1156     case OP_NOT_DIGIT:
1157     case OP_DIGIT:
1158     case OP_NOT_WHITESPACE:
1159     case OP_WHITESPACE:
1160     case OP_NOT_WORDCHAR:
1161     case OP_WORDCHAR:
1162     case OP_ANY:
1163     branchlength++;
1164     cc++;
1165     break;
1166    
1167     /* The single-byte matcher isn't allowed */
1168    
1169     case OP_ANYBYTE:
1170     return -2;
1171    
1172     /* Check a class for variable quantification */
1173    
1174     #ifdef SUPPORT_UTF8
1175     case OP_XCLASS:
1176     cc += GET(cc, 1) - 33;
1177     /* Fall through */
1178     #endif
1179    
1180     case OP_CLASS:
1181     case OP_NCLASS:
1182     cc += 33;
1183    
1184     switch (*cc)
1185     {
1186     case OP_CRSTAR:
1187     case OP_CRMINSTAR:
1188     case OP_CRQUERY:
1189     case OP_CRMINQUERY:
1190     return -1;
1191    
1192     case OP_CRRANGE:
1193     case OP_CRMINRANGE:
1194     if (GET2(cc,1) != GET2(cc,3)) return -1;
1195     branchlength += GET2(cc,1);
1196     cc += 5;
1197     break;
1198    
1199     default:
1200     branchlength++;
1201     }
1202     break;
1203    
1204     /* Anything else is variable length */
1205    
1206     default:
1207     return -1;
1208     }
1209     }
1210     /* Control never gets here */
1211     }
1212    
1213    
1214    
1215    
1216     /*************************************************
1217     * Scan compiled regex for numbered bracket *
1218     *************************************************/
1219    
1220     /* This little function scans through a compiled pattern until it finds a
1221     capturing bracket with the given number.
1222    
1223     Arguments:
1224     code points to start of expression
1225     utf8 TRUE in UTF-8 mode
1226     number the required bracket number
1227    
1228     Returns: pointer to the opcode for the bracket, or NULL if not found
1229     */
1230    
1231     static const uschar *
1232     find_bracket(const uschar *code, BOOL utf8, int number)
1233     {
1234     for (;;)
1235     {
1236     register int c = *code;
1237     if (c == OP_END) return NULL;
1238 nigel 91
1239     /* XCLASS is used for classes that cannot be represented just by a bit
1240     map. This includes negated single high-valued characters. The length in
1241     the table is zero; the actual length is stored in the compiled code. */
1242    
1243     if (c == OP_XCLASS) code += GET(code, 1);
1244    
1245 nigel 93 /* Handle capturing bracket */
1246 nigel 91
1247 nigel 93 else if (c == OP_CBRA)
1248 nigel 77 {
1249 nigel 93 int n = GET2(code, 1+LINK_SIZE);
1250 nigel 77 if (n == number) return (uschar *)code;
1251 nigel 93 code += _pcre_OP_lengths[c];
1252 nigel 77 }
1253 nigel 91
1254 nigel 93 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1255     a multi-byte character. The length in the table is a minimum, so we have to
1256     arrange to skip the extra bytes. */
1257 nigel 91
1258 nigel 77 else
1259     {
1260     code += _pcre_OP_lengths[c];
1261 ph10 107 #ifdef SUPPORT_UTF8
1262 nigel 77 if (utf8) switch(c)
1263     {
1264     case OP_CHAR:
1265     case OP_CHARNC:
1266     case OP_EXACT:
1267     case OP_UPTO:
1268     case OP_MINUPTO:
1269 nigel 93 case OP_POSUPTO:
1270 nigel 77 case OP_STAR:
1271     case OP_MINSTAR:
1272 nigel 93 case OP_POSSTAR:
1273 nigel 77 case OP_PLUS:
1274     case OP_MINPLUS:
1275 nigel 93 case OP_POSPLUS:
1276 nigel 77 case OP_QUERY:
1277     case OP_MINQUERY:
1278 nigel 93 case OP_POSQUERY:
1279     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1280 nigel 77 break;
1281     }
1282 ph10 111 #endif
1283 nigel 77 }
1284     }
1285     }
1286    
1287    
1288    
1289     /*************************************************
1290     * Scan compiled regex for recursion reference *
1291     *************************************************/
1292    
1293     /* This little function scans through a compiled pattern until it finds an
1294     instance of OP_RECURSE.
1295    
1296     Arguments:
1297     code points to start of expression
1298     utf8 TRUE in UTF-8 mode
1299    
1300     Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1301     */
1302    
1303     static const uschar *
1304     find_recurse(const uschar *code, BOOL utf8)
1305     {
1306     for (;;)
1307     {
1308     register int c = *code;
1309     if (c == OP_END) return NULL;
1310 nigel 91 if (c == OP_RECURSE) return code;
1311    
1312     /* XCLASS is used for classes that cannot be represented just by a bit
1313     map. This includes negated single high-valued characters. The length in
1314     the table is zero; the actual length is stored in the compiled code. */
1315    
1316     if (c == OP_XCLASS) code += GET(code, 1);
1317    
1318     /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes
1319     that are followed by a character may be followed by a multi-byte character.
1320 nigel 93 The length in the table is a minimum, so we have to arrange to skip the extra
1321     bytes. */
1322 nigel 91
1323 nigel 77 else
1324     {
1325     code += _pcre_OP_lengths[c];
1326 ph10 107 #ifdef SUPPORT_UTF8
1327 nigel 77 if (utf8) switch(c)
1328     {
1329     case OP_CHAR:
1330     case OP_CHARNC:
1331     case OP_EXACT:
1332     case OP_UPTO:
1333     case OP_MINUPTO:
1334 nigel 93 case OP_POSUPTO:
1335 nigel 77 case OP_STAR:
1336     case OP_MINSTAR:
1337 nigel 93 case OP_POSSTAR:
1338 nigel 77 case OP_PLUS:
1339     case OP_MINPLUS:
1340 nigel 93 case OP_POSPLUS:
1341 nigel 77 case OP_QUERY:
1342     case OP_MINQUERY:
1343 nigel 93 case OP_POSQUERY:
1344     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1345 nigel 77 break;
1346     }
1347 ph10 111 #endif
1348 nigel 77 }
1349     }
1350     }
1351    
1352    
1353    
1354     /*************************************************
1355     * Scan compiled branch for non-emptiness *
1356     *************************************************/
1357    
1358     /* This function scans through a branch of a compiled pattern to see whether it
1359 nigel 93 can match the empty string or not. It is called from could_be_empty()
1360     below and from compile_branch() when checking for an unlimited repeat of a
1361     group that can match nothing. Note that first_significant_code() skips over
1362     assertions. If we hit an unclosed bracket, we return "empty" - this means we've
1363     struck an inner bracket whose current branch will already have been scanned.
1364 nigel 77
1365     Arguments:
1366     code points to start of search
1367     endcode points to where to stop
1368     utf8 TRUE if in UTF8 mode
1369    
1370     Returns: TRUE if what is matched could be empty
1371     */
1372    
1373     static BOOL
1374     could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1375     {
1376     register int c;
1377 nigel 93 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1378 nigel 77 code < endcode;
1379     code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1380     {
1381     const uschar *ccode;
1382    
1383     c = *code;
1384 ph10 170
1385     /* Groups with zero repeats can of course be empty; skip them. */
1386 nigel 77
1387 ph10 170 if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1388     {
1389     do code += GET(code, 1); while (*code == OP_ALT);
1390     c = *code;
1391     continue;
1392     }
1393    
1394     /* For other groups, scan the branches. */
1395    
1396 nigel 93 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE)
1397 nigel 77 {
1398     BOOL empty_branch;
1399     if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1400    
1401     /* Scan a closed bracket */
1402    
1403     empty_branch = FALSE;
1404     do
1405     {
1406     if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1407     empty_branch = TRUE;
1408     code += GET(code, 1);
1409     }
1410     while (*code == OP_ALT);
1411     if (!empty_branch) return FALSE; /* All branches are non-empty */
1412 ph10 170 c = *code;
1413 nigel 93 continue;
1414 nigel 77 }
1415    
1416 nigel 93 /* Handle the other opcodes */
1417    
1418     switch (c)
1419 nigel 77 {
1420     /* Check for quantifiers after a class */
1421    
1422     #ifdef SUPPORT_UTF8
1423     case OP_XCLASS:
1424     ccode = code + GET(code, 1);
1425     goto CHECK_CLASS_REPEAT;
1426     #endif
1427    
1428     case OP_CLASS:
1429     case OP_NCLASS:
1430     ccode = code + 33;
1431    
1432     #ifdef SUPPORT_UTF8
1433     CHECK_CLASS_REPEAT:
1434     #endif
1435    
1436     switch (*ccode)
1437     {
1438     case OP_CRSTAR: /* These could be empty; continue */
1439     case OP_CRMINSTAR:
1440     case OP_CRQUERY:
1441     case OP_CRMINQUERY:
1442     break;
1443    
1444     default: /* Non-repeat => class must match */
1445     case OP_CRPLUS: /* These repeats aren't empty */
1446     case OP_CRMINPLUS:
1447     return FALSE;
1448    
1449     case OP_CRRANGE:
1450     case OP_CRMINRANGE:
1451     if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1452     break;
1453     }
1454     break;
1455    
1456     /* Opcodes that must match a character */
1457    
1458     case OP_PROP:
1459     case OP_NOTPROP:
1460     case OP_EXTUNI:
1461     case OP_NOT_DIGIT:
1462     case OP_DIGIT:
1463     case OP_NOT_WHITESPACE:
1464     case OP_WHITESPACE:
1465     case OP_NOT_WORDCHAR:
1466     case OP_WORDCHAR:
1467     case OP_ANY:
1468     case OP_ANYBYTE:
1469     case OP_CHAR:
1470     case OP_CHARNC:
1471     case OP_NOT:
1472     case OP_PLUS:
1473     case OP_MINPLUS:
1474 nigel 93 case OP_POSPLUS:
1475 nigel 77 case OP_EXACT:
1476     case OP_NOTPLUS:
1477     case OP_NOTMINPLUS:
1478 nigel 93 case OP_NOTPOSPLUS:
1479 nigel 77 case OP_NOTEXACT:
1480     case OP_TYPEPLUS:
1481     case OP_TYPEMINPLUS:
1482 nigel 93 case OP_TYPEPOSPLUS:
1483 nigel 77 case OP_TYPEEXACT:
1484     return FALSE;
1485    
1486     /* End of branch */
1487    
1488     case OP_KET:
1489     case OP_KETRMAX:
1490     case OP_KETRMIN:
1491     case OP_ALT:
1492     return TRUE;
1493    
1494 nigel 93 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1495     MINUPTO, and POSUPTO may be followed by a multibyte character */
1496 nigel 77
1497     #ifdef SUPPORT_UTF8
1498     case OP_STAR:
1499     case OP_MINSTAR:
1500 nigel 93 case OP_POSSTAR:
1501 nigel 77 case OP_QUERY:
1502     case OP_MINQUERY:
1503 nigel 93 case OP_POSQUERY:
1504 nigel 77 case OP_UPTO:
1505     case OP_MINUPTO:
1506 nigel 93 case OP_POSUPTO:
1507 nigel 77 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1508     break;
1509     #endif
1510     }
1511     }
1512    
1513     return TRUE;
1514     }
1515    
1516    
1517    
1518     /*************************************************
1519     * Scan compiled regex for non-emptiness *
1520     *************************************************/
1521    
1522     /* This function is called to check for left recursive calls. We want to check
1523     the current branch of the current pattern to see if it could match the empty
1524     string. If it could, we must look outwards for branches at other levels,
1525     stopping when we pass beyond the bracket which is the subject of the recursion.
1526    
1527     Arguments:
1528     code points to start of the recursion
1529     endcode points to where to stop (current RECURSE item)
1530     bcptr points to the chain of current (unclosed) branch starts
1531     utf8 TRUE if in UTF-8 mode
1532    
1533     Returns: TRUE if what is matched could be empty
1534     */
1535    
1536     static BOOL
1537     could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1538     BOOL utf8)
1539     {
1540     while (bcptr != NULL && bcptr->current >= code)
1541     {
1542     if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1543     bcptr = bcptr->outer;
1544     }
1545     return TRUE;
1546     }
1547    
1548    
1549    
1550     /*************************************************
1551     * Check for POSIX class syntax *
1552     *************************************************/
1553    
1554     /* This function is called when the sequence "[:" or "[." or "[=" is
1555     encountered in a character class. It checks whether this is followed by an
1556     optional ^ and then a sequence of letters, terminated by a matching ":]" or
1557     ".]" or "=]".
1558    
1559     Arguments:
1560     ptr pointer to the initial [
1561     endptr where to return the end pointer
1562     cd pointer to compile data
1563    
1564     Returns: TRUE or FALSE
1565     */
1566    
1567     static BOOL
1568     check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1569     {
1570     int terminator; /* Don't combine these lines; the Solaris cc */
1571     terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1572     if (*(++ptr) == '^') ptr++;
1573     while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1574     if (*ptr == terminator && ptr[1] == ']')
1575     {
1576     *endptr = ptr;
1577     return TRUE;
1578     }
1579     return FALSE;
1580     }
1581    
1582    
1583    
1584    
1585     /*************************************************
1586     * Check POSIX class name *
1587     *************************************************/
1588    
1589     /* This function is called to check the name given in a POSIX-style class entry
1590     such as [:alnum:].
1591    
1592     Arguments:
1593     ptr points to the first letter
1594     len the length of the name
1595    
1596     Returns: a value representing the name, or -1 if unknown
1597     */
1598    
1599     static int
1600     check_posix_name(const uschar *ptr, int len)
1601     {
1602     register int yield = 0;
1603     while (posix_name_lengths[yield] != 0)
1604     {
1605     if (len == posix_name_lengths[yield] &&
1606     strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1607     yield++;
1608     }
1609     return -1;
1610     }
1611    
1612    
1613     /*************************************************
1614     * Adjust OP_RECURSE items in repeated group *
1615     *************************************************/
1616    
1617     /* OP_RECURSE items contain an offset from the start of the regex to the group
1618     that is referenced. This means that groups can be replicated for fixed
1619     repetition simply by copying (because the recursion is allowed to refer to
1620     earlier groups that are outside the current group). However, when a group is
1621     optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1622     it, after it has been compiled. This means that any OP_RECURSE items within it
1623     that refer to the group itself or any contained groups have to have their
1624 nigel 93 offsets adjusted. That is one of the jobs of this function. Before it is called,
1625     the partially compiled regex must be temporarily terminated with OP_END.
1626 nigel 77
1627 nigel 93 This function has been extended with the possibility of forward references for
1628     recursions and subroutine calls. It must also check the list of such references
1629     for the group we are dealing with. If it finds that one of the recursions in
1630     the current group is on this list, it adjusts the offset in the list, not the
1631     value in the reference (which is a group number).
1632    
1633 nigel 77 Arguments:
1634     group points to the start of the group
1635     adjust the amount by which the group is to be moved
1636     utf8 TRUE in UTF-8 mode
1637     cd contains pointers to tables etc.
1638 nigel 93 save_hwm the hwm forward reference pointer at the start of the group
1639 nigel 77
1640     Returns: nothing
1641     */
1642    
1643     static void
1644 nigel 93 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1645     uschar *save_hwm)
1646 nigel 77 {
1647     uschar *ptr = group;
1648     while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1649     {
1650 nigel 93 int offset;
1651     uschar *hc;
1652    
1653     /* See if this recursion is on the forward reference list. If so, adjust the
1654     reference. */
1655    
1656     for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1657     {
1658     offset = GET(hc, 0);
1659     if (cd->start_code + offset == ptr + 1)
1660     {
1661     PUT(hc, 0, offset + adjust);
1662     break;
1663     }
1664     }
1665    
1666     /* Otherwise, adjust the recursion offset if it's after the start of this
1667     group. */
1668    
1669     if (hc >= cd->hwm)
1670     {
1671     offset = GET(ptr, 1);
1672     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1673     }
1674    
1675 nigel 77 ptr += 1 + LINK_SIZE;
1676     }
1677     }
1678    
1679    
1680    
1681     /*************************************************
1682     * Insert an automatic callout point *
1683     *************************************************/
1684    
1685     /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1686     callout points before each pattern item.
1687    
1688     Arguments:
1689     code current code pointer
1690     ptr current pattern pointer
1691     cd pointers to tables etc
1692    
1693     Returns: new code pointer
1694     */
1695    
1696     static uschar *
1697     auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1698     {
1699     *code++ = OP_CALLOUT;
1700     *code++ = 255;
1701     PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1702     PUT(code, LINK_SIZE, 0); /* Default length */
1703     return code + 2*LINK_SIZE;
1704     }
1705    
1706    
1707    
1708     /*************************************************
1709     * Complete a callout item *
1710     *************************************************/
1711    
1712     /* A callout item contains the length of the next item in the pattern, which
1713     we can't fill in till after we have reached the relevant point. This is used
1714     for both automatic and manual callouts.
1715    
1716     Arguments:
1717     previous_callout points to previous callout item
1718     ptr current pattern pointer
1719     cd pointers to tables etc
1720    
1721     Returns: nothing
1722     */
1723    
1724     static void
1725     complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1726     {
1727     int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1728     PUT(previous_callout, 2 + LINK_SIZE, length);
1729     }
1730    
1731    
1732    
#ifdef SUPPORT_UCP
/*************************************************
*           Get othercase range                  *
*************************************************/

/* Given the two ends of a class range (UTF-8 mode with UCP support), this
function walks upwards through the characters looking for a run whose
"other case" values are consecutive. One such run is returned per call, and the
starting value is advanced so that the next call carries on from where this one
stopped.

Arguments:
  cptr        points to the character value to start from; updated on success
  d           last character value of the range
  ocptr       receives the first value of the othercase run
  odptr       receives the last value of the othercase run

Yield:        TRUE when a run has been returned; FALSE when there are no more
*/

static BOOL
get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
  unsigned int *odptr)
{
unsigned int ch = *cptr;
unsigned int first_other = NOTACHAR;
unsigned int expected;

/* Advance to the first character in the range that has an other case. */

while (ch <= d)
  {
  first_other = _pcre_ucp_othercase(ch);
  if (first_other != NOTACHAR) break;
  ch++;
  }

/* Ran off the end of the range: nothing left to report. The caller's start
value is deliberately left untouched in this case. */

if (ch > d) return FALSE;

*ocptr = first_other;
expected = first_other + 1;

/* Keep going for as long as each following character's other case is exactly
one more than the previous one. */

ch++;
while (ch <= d && _pcre_ucp_othercase(ch) == expected)
  {
  expected++;
  ch++;
  }

*odptr = expected - 1;
*cptr = ch;

return TRUE;
}
#endif  /* SUPPORT_UCP */
1778    
1779    
1780 nigel 93
1781 nigel 77 /*************************************************
1782 nigel 93 * Check if auto-possessifying is possible *
1783     *************************************************/
1784    
1785     /* This function is called for unlimited repeats of certain items, to see
1786     whether the next thing could possibly match the repeated item. If not, it makes
1787     sense to automatically possessify the repeated item.
1788    
1789     Arguments:
1790     op_code the repeated op code
1791     this data for this item, depends on the opcode
1792     utf8 TRUE in UTF-8 mode
1793     utf8_char used for utf8 character bytes, NULL if not relevant
1794     ptr next character in pattern
1795     options options bits
1796     cd contains pointers to tables etc.
1797    
1798     Returns: TRUE if possessifying is wanted
1799     */
1800    
1801     static BOOL
1802     check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
1803     const uschar *ptr, int options, compile_data *cd)
1804     {
1805     int next;
1806    
1807     /* Skip whitespace and comments in extended mode */
1808    
1809     if ((options & PCRE_EXTENDED) != 0)
1810     {
1811     for (;;)
1812     {
1813     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1814     if (*ptr == '#')
1815     {
1816     while (*(++ptr) != 0)
1817     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1818     }
1819     else break;
1820     }
1821     }
1822    
1823     /* If the next item is one that we can handle, get its value. A non-negative
1824     value is a character, a negative value is an escape value. */
1825    
1826     if (*ptr == '\\')
1827     {
1828     int temperrorcode = 0;
1829     next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
1830     if (temperrorcode != 0) return FALSE;
1831     ptr++; /* Point after the escape sequence */
1832     }
1833    
1834     else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
1835     {
1836     #ifdef SUPPORT_UTF8
1837     if (utf8) { GETCHARINC(next, ptr); } else
1838     #endif
1839     next = *ptr++;
1840     }
1841    
1842     else return FALSE;
1843    
1844     /* Skip whitespace and comments in extended mode */
1845    
1846     if ((options & PCRE_EXTENDED) != 0)
1847     {
1848     for (;;)
1849     {
1850     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1851     if (*ptr == '#')
1852     {
1853     while (*(++ptr) != 0)
1854     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1855     }
1856     else break;
1857     }
1858     }
1859    
1860     /* If the next thing is itself optional, we have to give up. */
1861    
1862     if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
1863     return FALSE;
1864    
1865     /* Now compare the next item with the previous opcode. If the previous is a
1866     positive single character match, "item" either contains the character or, if
1867     "item" is greater than 127 in utf8 mode, the character's bytes are in
1868     utf8_char. */
1869    
1870    
1871     /* Handle cases when the next item is a character. */
1872    
1873     if (next >= 0) switch(op_code)
1874     {
1875     case OP_CHAR:
1876     #ifdef SUPPORT_UTF8
1877     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1878     #endif
1879     return item != next;
1880    
1881     /* For CHARNC (caseless character) we must check the other case. If we have
1882     Unicode property support, we can use it to test the other case of
1883     high-valued characters. */
1884    
1885     case OP_CHARNC:
1886     #ifdef SUPPORT_UTF8
1887     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1888     #endif
1889     if (item == next) return FALSE;
1890     #ifdef SUPPORT_UTF8
1891     if (utf8)
1892     {
1893     unsigned int othercase;
1894     if (next < 128) othercase = cd->fcc[next]; else
1895     #ifdef SUPPORT_UCP
1896     othercase = _pcre_ucp_othercase((unsigned int)next);
1897     #else
1898     othercase = NOTACHAR;
1899     #endif
1900     return (unsigned int)item != othercase;
1901     }
1902     else
1903     #endif /* SUPPORT_UTF8 */
1904     return (item != cd->fcc[next]); /* Non-UTF-8 mode */
1905    
1906     /* For OP_NOT, "item" must be a single-byte character. */
1907    
1908     case OP_NOT:
1909     if (next < 0) return FALSE; /* Not a character */
1910     if (item == next) return TRUE;
1911     if ((options & PCRE_CASELESS) == 0) return FALSE;
1912     #ifdef SUPPORT_UTF8
1913     if (utf8)
1914     {
1915     unsigned int othercase;
1916     if (next < 128) othercase = cd->fcc[next]; else
1917     #ifdef SUPPORT_UCP
1918     othercase = _pcre_ucp_othercase(next);
1919     #else
1920     othercase = NOTACHAR;
1921     #endif
1922     return (unsigned int)item == othercase;
1923     }
1924     else
1925     #endif /* SUPPORT_UTF8 */
1926     return (item == cd->fcc[next]); /* Non-UTF-8 mode */
1927    
1928     case OP_DIGIT:
1929     return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
1930    
1931     case OP_NOT_DIGIT:
1932     return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
1933    
1934     case OP_WHITESPACE:
1935     return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
1936    
1937     case OP_NOT_WHITESPACE:
1938     return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
1939    
1940     case OP_WORDCHAR:
1941     return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
1942    
1943     case OP_NOT_WORDCHAR:
1944     return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
1945    
1946     default:
1947     return FALSE;
1948     }
1949    
1950    
1951     /* Handle the case when the next item is \d, \s, etc. */
1952    
1953     switch(op_code)
1954     {
1955     case OP_CHAR:
1956     case OP_CHARNC:
1957     #ifdef SUPPORT_UTF8
1958     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1959     #endif
1960     switch(-next)
1961     {
1962     case ESC_d:
1963     return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
1964    
1965     case ESC_D:
1966     return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
1967    
1968     case ESC_s:
1969     return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
1970    
1971     case ESC_S:
1972     return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
1973    
1974     case ESC_w:
1975     return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
1976    
1977     case ESC_W:
1978     return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
1979    
1980     default:
1981     return FALSE;
1982     }
1983    
1984     case OP_DIGIT:
1985     return next == -ESC_D || next == -ESC_s || next == -ESC_W;
1986    
1987     case OP_NOT_DIGIT:
1988     return next == -ESC_d;
1989    
1990     case OP_WHITESPACE:
1991     return next == -ESC_S || next == -ESC_d || next == -ESC_w;
1992    
1993     case OP_NOT_WHITESPACE:
1994     return next == -ESC_s;
1995    
1996     case OP_WORDCHAR:
1997     return next == -ESC_W || next == -ESC_s;
1998    
1999     case OP_NOT_WORDCHAR:
2000     return next == -ESC_w || next == -ESC_d;
2001    
2002     default:
2003     return FALSE;
2004     }
2005    
2006     /* Control does not reach here */
2007     }
2008    
2009    
2010    
2011     /*************************************************
2012 nigel 77 * Compile one branch *
2013     *************************************************/
2014    
2015 nigel 93 /* Scan the pattern, compiling it into a vector. If the options are
2016 nigel 77 changed during the branch, the pointer is used to change the external options
2017 nigel 93 bits. This function is used during the pre-compile phase when we are trying
2018     to find out the amount of memory needed, as well as during the real compile
2019     phase. The value of lengthptr distinguishes the two phases.
2020 nigel 77
2021     Arguments:
2022     optionsptr pointer to the option bits
2023     codeptr points to the pointer to the current code point
2024     ptrptr points to the current pattern pointer
2025     errorcodeptr points to error code variable
2026     firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2027     reqbyteptr set to the last literal character required, else < 0
2028     bcptr points to current branch chain
2029     cd contains pointers to tables etc.
2030 nigel 93 lengthptr NULL during the real compile phase
2031     points to length accumulator during pre-compile phase
2032 nigel 77
2033     Returns: TRUE on success
2034     FALSE, with *errorcodeptr set non-zero on error
2035     */
2036    
2037     static BOOL
2038 nigel 93 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2039     int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2040     compile_data *cd, int *lengthptr)
2041 nigel 77 {
2042     int repeat_type, op_type;
2043     int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2044     int bravalue = 0;
2045     int greedy_default, greedy_non_default;
2046     int firstbyte, reqbyte;
2047     int zeroreqbyte, zerofirstbyte;
2048     int req_caseopt, reqvary, tempreqvary;
2049     int options = *optionsptr;
2050     int after_manual_callout = 0;
2051 nigel 93 int length_prevgroup = 0;
2052 nigel 77 register int c;
2053     register uschar *code = *codeptr;
2054 nigel 93 uschar *last_code = code;
2055     uschar *orig_code = code;
2056 nigel 77 uschar *tempcode;
2057     BOOL inescq = FALSE;
2058     BOOL groupsetfirstbyte = FALSE;
2059     const uschar *ptr = *ptrptr;
2060     const uschar *tempptr;
2061     uschar *previous = NULL;
2062     uschar *previous_callout = NULL;
2063 nigel 93 uschar *save_hwm = NULL;
2064 nigel 77 uschar classbits[32];
2065    
2066     #ifdef SUPPORT_UTF8
2067     BOOL class_utf8;
2068     BOOL utf8 = (options & PCRE_UTF8) != 0;
2069     uschar *class_utf8data;
2070     uschar utf8_char[6];
2071     #else
2072     BOOL utf8 = FALSE;
2073 nigel 93 uschar *utf8_char = NULL;
2074 nigel 77 #endif
2075    
2076 nigel 93 #ifdef DEBUG
2077     if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2078     #endif
2079    
2080 nigel 77 /* Set up the default and non-default settings for greediness */
2081    
2082     greedy_default = ((options & PCRE_UNGREEDY) != 0);
2083     greedy_non_default = greedy_default ^ 1;
2084    
2085     /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2086     matching encountered yet". It gets changed to REQ_NONE if we hit something that
2087     matches a non-fixed char first char; reqbyte just remains unset if we never
2088     find one.
2089    
2090     When we hit a repeat whose minimum is zero, we may have to adjust these values
2091     to take the zero repeat into account. This is implemented by setting them to
2092     zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2093     item types that can be repeated set these backoff variables appropriately. */
2094    
2095     firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2096    
2097     /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2098     according to the current setting of the caseless flag. REQ_CASELESS is a bit
2099     value > 255. It is added into the firstbyte or reqbyte variables to record the
2100     case status of the value. This is used only for ASCII characters. */
2101    
2102     req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2103    
2104     /* Switch on next character until the end of the branch */
2105    
2106     for (;; ptr++)
2107     {
2108     BOOL negate_class;
2109     BOOL possessive_quantifier;
2110     BOOL is_quantifier;
2111 nigel 93 BOOL is_recurse;
2112 nigel 77 int class_charcount;
2113     int class_lastchar;
2114     int newoptions;
2115     int recno;
2116 ph10 167 int refsign;
2117 nigel 77 int skipbytes;
2118     int subreqbyte;
2119     int subfirstbyte;
2120 nigel 93 int terminator;
2121 nigel 77 int mclength;
2122     uschar mcbuffer[8];
2123    
2124 nigel 93 /* Get next byte in the pattern */
2125 nigel 77
2126     c = *ptr;
2127    
2128 nigel 93 /* If we are in the pre-compile phase, accumulate the length used for the
2129     previous cycle of this loop. */
2130    
2131     if (lengthptr != NULL)
2132     {
2133     #ifdef DEBUG
2134     if (code > cd->hwm) cd->hwm = code; /* High water info */
2135     #endif
2136     if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2137     {
2138     *errorcodeptr = ERR52;
2139     goto FAILED;
2140     }
2141    
2142     /* There is at least one situation where code goes backwards: this is the
2143     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2144     the class is simply eliminated. However, it is created first, so we have to
2145     allow memory for it. Therefore, don't ever reduce the length at this point.
2146     */
2147    
2148     if (code < last_code) code = last_code;
2149     *lengthptr += code - last_code;
2150     DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2151    
2152     /* If "previous" is set and it is not at the start of the work space, move
2153     it back to there, in order to avoid filling up the work space. Otherwise,
2154     if "previous" is NULL, reset the current code pointer to the start. */
2155    
2156     if (previous != NULL)
2157     {
2158     if (previous > orig_code)
2159     {
2160     memmove(orig_code, previous, code - previous);
2161     code -= previous - orig_code;
2162     previous = orig_code;
2163     }
2164     }
2165     else code = orig_code;
2166    
2167     /* Remember where this code item starts so we can pick up the length
2168     next time round. */
2169    
2170     last_code = code;
2171     }
2172    
2173     /* In the real compile phase, just check the workspace used by the forward
2174     reference list. */
2175    
2176     else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2177     {
2178     *errorcodeptr = ERR52;
2179     goto FAILED;
2180     }
2181    
2182 nigel 77 /* If in \Q...\E, check for the end; if not, we have a literal */
2183    
2184     if (inescq && c != 0)
2185     {
2186     if (c == '\\' && ptr[1] == 'E')
2187     {
2188     inescq = FALSE;
2189     ptr++;
2190     continue;
2191     }
2192     else
2193     {
2194     if (previous_callout != NULL)
2195     {
2196 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2197     complete_callout(previous_callout, ptr, cd);
2198 nigel 77 previous_callout = NULL;
2199     }
2200     if ((options & PCRE_AUTO_CALLOUT) != 0)
2201     {
2202     previous_callout = code;
2203     code = auto_callout(code, ptr, cd);
2204     }
2205     goto NORMAL_CHAR;
2206     }
2207     }
2208    
2209     /* Fill in length of a previous callout, except when the next thing is
2210     a quantifier. */
2211    
2212     is_quantifier = c == '*' || c == '+' || c == '?' ||
2213     (c == '{' && is_counted_repeat(ptr+1));
2214    
2215     if (!is_quantifier && previous_callout != NULL &&
2216     after_manual_callout-- <= 0)
2217     {
2218 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2219     complete_callout(previous_callout, ptr, cd);
2220 nigel 77 previous_callout = NULL;
2221     }
2222    
2223     /* In extended mode, skip white space and comments */
2224    
2225     if ((options & PCRE_EXTENDED) != 0)
2226     {
2227     if ((cd->ctypes[c] & ctype_space) != 0) continue;
2228     if (c == '#')
2229     {
2230 nigel 93 while (*(++ptr) != 0)
2231 nigel 91 {
2232 nigel 93 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2233 nigel 91 }
2234 nigel 93 if (*ptr != 0) continue;
2235    
2236 nigel 91 /* Else fall through to handle end of string */
2237     c = 0;
2238 nigel 77 }
2239     }
2240    
2241     /* No auto callout for quantifiers. */
2242    
2243     if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2244     {
2245     previous_callout = code;
2246     code = auto_callout(code, ptr, cd);
2247     }
2248    
2249     switch(c)
2250     {
2251 nigel 93 /* ===================================================================*/
2252     case 0: /* The branch terminates at string end */
2253     case '|': /* or | or ) */
2254 nigel 77 case ')':
2255     *firstbyteptr = firstbyte;
2256     *reqbyteptr = reqbyte;
2257     *codeptr = code;
2258     *ptrptr = ptr;
2259 nigel 93 if (lengthptr != NULL)
2260     {
2261     *lengthptr += code - last_code; /* To include callout length */
2262     DPRINTF((">> end branch\n"));
2263     }
2264 nigel 77 return TRUE;
2265    
2266 nigel 93
2267     /* ===================================================================*/
2268 nigel 77 /* Handle single-character metacharacters. In multiline mode, ^ disables
2269     the setting of any following char as a first character. */
2270    
2271     case '^':
2272     if ((options & PCRE_MULTILINE) != 0)
2273     {
2274     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2275     }
2276     previous = NULL;
2277     *code++ = OP_CIRC;
2278     break;
2279    
2280     case '$':
2281     previous = NULL;
2282     *code++ = OP_DOLL;
2283     break;
2284    
2285     /* There can never be a first char if '.' is first, whatever happens about
2286     repeats. The value of reqbyte doesn't change either. */
2287    
2288     case '.':
2289     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2290     zerofirstbyte = firstbyte;
2291     zeroreqbyte = reqbyte;
2292     previous = code;
2293     *code++ = OP_ANY;
2294     break;
2295    
2296 nigel 93
2297     /* ===================================================================*/
2298 nigel 87 /* Character classes. If the included characters are all < 256, we build a
2299     32-byte bitmap of the permitted characters, except in the special case
2300     where there is only one such character. For negated classes, we build the
2301     map as usual, then invert it at the end. However, we use a different opcode
2302     so that data characters > 255 can be handled correctly.
2303 nigel 77
2304     If the class contains characters outside the 0-255 range, a different
2305     opcode is compiled. It may optionally have a bit map for characters < 256,
2306     but those above are explicitly listed afterwards. A flag byte tells
2307     whether the bitmap is present, and whether this is a negated class or not.
2308     */
2309    
2310     case '[':
2311     previous = code;
2312    
2313     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2314     they are encountered at the top level, so we'll do that too. */
2315    
2316     if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2317     check_posix_syntax(ptr, &tempptr, cd))
2318     {
2319     *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2320     goto FAILED;
2321     }
2322    
2323     /* If the first character is '^', set the negation flag and skip it. */
2324    
2325     if ((c = *(++ptr)) == '^')
2326     {
2327     negate_class = TRUE;
2328     c = *(++ptr);
2329     }
2330     else
2331     {
2332     negate_class = FALSE;
2333     }
2334    
2335     /* Keep a count of chars with values < 256 so that we can optimize the case
2336 nigel 93 of just a single character (as long as it's < 256). However, For higher
2337     valued UTF-8 characters, we don't yet do any optimization. */
2338 nigel 77
2339     class_charcount = 0;
2340     class_lastchar = -1;
2341    
2342 nigel 93 /* Initialize the 32-char bit map to all zeros. We build the map in a
2343     temporary bit of memory, in case the class contains only 1 character (less
2344     than 256), because in that case the compiled code doesn't use the bit map.
2345     */
2346    
2347     memset(classbits, 0, 32 * sizeof(uschar));
2348    
2349 nigel 77 #ifdef SUPPORT_UTF8
2350     class_utf8 = FALSE; /* No chars >= 256 */
2351 nigel 93 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
2352 nigel 77 #endif
2353    
2354     /* Process characters until ] is reached. By writing this as a "do" it
2355 nigel 93 means that an initial ] is taken as a data character. At the start of the
2356     loop, c contains the first byte of the character. */
2357 nigel 77
2358 nigel 93 if (c != 0) do
2359 nigel 77 {
2360 nigel 93 const uschar *oldptr;
2361    
2362 nigel 77 #ifdef SUPPORT_UTF8
2363     if (utf8 && c > 127)
2364     { /* Braces are required because the */
2365     GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2366     }
2367     #endif
2368    
2369     /* Inside \Q...\E everything is literal except \E */
2370    
2371     if (inescq)
2372     {
2373 nigel 93 if (c == '\\' && ptr[1] == 'E') /* If we are at \E */
2374 nigel 77 {
2375 nigel 93 inescq = FALSE; /* Reset literal state */
2376     ptr++; /* Skip the 'E' */
2377     continue; /* Carry on with next */
2378 nigel 77 }
2379 nigel 93 goto CHECK_RANGE; /* Could be range if \E follows */
2380 nigel 77 }
2381    
2382     /* Handle POSIX class names. Perl allows a negation extension of the
2383     form [:^name:]. A square bracket that doesn't match the syntax is
2384     treated as a literal. We also recognize the POSIX constructions
2385     [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2386     5.6 and 5.8 do. */
2387    
2388     if (c == '[' &&
2389     (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2390     check_posix_syntax(ptr, &tempptr, cd))
2391     {
2392     BOOL local_negate = FALSE;
2393 nigel 87 int posix_class, taboffset, tabopt;
2394 nigel 77 register const uschar *cbits = cd->cbits;
2395 nigel 87 uschar pbits[32];
2396 nigel 77
2397     if (ptr[1] != ':')
2398     {
2399     *errorcodeptr = ERR31;
2400     goto FAILED;
2401     }
2402    
2403     ptr += 2;
2404     if (*ptr == '^')
2405     {
2406     local_negate = TRUE;
2407     ptr++;
2408     }
2409    
2410     posix_class = check_posix_name(ptr, tempptr - ptr);
2411     if (posix_class < 0)
2412     {
2413     *errorcodeptr = ERR30;
2414     goto FAILED;
2415     }
2416    
2417     /* If matching is caseless, upper and lower are converted to
2418     alpha. This relies on the fact that the class table starts with
2419     alpha, lower, upper as the first 3 entries. */
2420    
2421     if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2422     posix_class = 0;
2423    
2424 nigel 87 /* We build the bit map for the POSIX class in a chunk of local store
2425     because we may be adding and subtracting from it, and we don't want to
2426     subtract bits that may be in the main map already. At the end we or the
2427     result into the bit map that is being built. */
2428 nigel 77
2429     posix_class *= 3;
2430 nigel 87
2431     /* Copy in the first table (always present) */
2432    
2433     memcpy(pbits, cbits + posix_class_maps[posix_class],
2434     32 * sizeof(uschar));
2435    
2436     /* If there is a second table, add or remove it as required. */
2437    
2438     taboffset = posix_class_maps[posix_class + 1];
2439     tabopt = posix_class_maps[posix_class + 2];
2440    
2441     if (taboffset >= 0)
2442 nigel 77 {
2443 nigel 87 if (tabopt >= 0)
2444     for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
2445 nigel 77 else
2446 nigel 87 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
2447 nigel 77 }
2448    
2449 nigel 87 /* Now see if we need to remove any special characters. An option
2450     value of 1 removes vertical space and 2 removes underscore. */
2451    
2452     if (tabopt < 0) tabopt = -tabopt;
2453     if (tabopt == 1) pbits[1] &= ~0x3c;
2454     else if (tabopt == 2) pbits[11] &= 0x7f;
2455    
2456     /* Add the POSIX table or its complement into the main table that is
2457     being built and we are done. */
2458    
2459     if (local_negate)
2460     for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2461     else
2462     for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2463    
2464 nigel 77 ptr = tempptr + 1;
2465     class_charcount = 10; /* Set > 1; assumes more than 1 per class */
2466     continue; /* End of POSIX syntax handling */
2467     }
2468    
2469     /* Backslash may introduce a single character, or it may introduce one
2470 nigel 93 of the specials, which just set a flag. The sequence \b is a special
2471     case. Inside a class (and only there) it is treated as backspace.
2472     Elsewhere it marks a word boundary. Other escapes have preset maps ready
2473     to or into the one we are building. We assume they have more than one
2474 nigel 77 character in them, so set class_charcount bigger than one. */
2475    
2476     if (c == '\\')
2477     {
2478 nigel 93 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2479     if (*errorcodeptr != 0) goto FAILED;
2480 nigel 77
2481     if (-c == ESC_b) c = '\b'; /* \b is backslash in a class */
2482     else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
2483 nigel 93 else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */
2484 nigel 77 else if (-c == ESC_Q) /* Handle start of quoted string */
2485     {
2486     if (ptr[1] == '\\' && ptr[2] == 'E')
2487     {
2488     ptr += 2; /* avoid empty string */
2489     }
2490     else inescq = TRUE;
2491     continue;
2492     }
2493    
2494     if (c < 0)
2495     {
2496     register const uschar *cbits = cd->cbits;
2497     class_charcount += 2; /* Greater than 1 is what matters */
2498 nigel 93
2499     /* Save time by not doing this in the pre-compile phase. */
2500    
2501     if (lengthptr == NULL) switch (-c)
2502 nigel 77 {
2503     case ESC_d:
2504     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2505     continue;
2506    
2507     case ESC_D:
2508     for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2509     continue;
2510    
2511     case ESC_w:
2512     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2513     continue;
2514    
2515     case ESC_W:
2516     for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2517     continue;
2518    
2519     case ESC_s:
2520     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2521     classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2522     continue;
2523    
2524     case ESC_S:
2525     for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2526     classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2527     continue;
2528    
2529 nigel 93 case ESC_E: /* Perl ignores an orphan \E */
2530     continue;
2531    
2532     default: /* Not recognized; fall through */
2533     break; /* Need "default" setting to stop compiler warning. */
2534     }
2535    
2536     /* In the pre-compile phase, just do the recognition. */
2537    
2538     else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2539     c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2540    
2541     /* We need to deal with \P and \p in both phases. */
2542    
2543 nigel 77 #ifdef SUPPORT_UCP
2544 nigel 93 if (-c == ESC_p || -c == ESC_P)
2545     {
2546     BOOL negated;
2547     int pdata;
2548     int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2549     if (ptype < 0) goto FAILED;
2550     class_utf8 = TRUE;
2551     *class_utf8data++ = ((-c == ESC_p) != negated)?
2552     XCL_PROP : XCL_NOTPROP;
2553     *class_utf8data++ = ptype;
2554     *class_utf8data++ = pdata;
2555     class_charcount -= 2; /* Not a < 256 character */
2556 nigel 77 continue;
2557 nigel 93 }
2558 nigel 77 #endif
2559 nigel 93 /* Unrecognized escapes are faulted if PCRE is running in its
2560     strict mode. By default, for compatibility with Perl, they are
2561     treated as literals. */
2562 nigel 77
2563 nigel 93 if ((options & PCRE_EXTRA) != 0)
2564     {
2565     *errorcodeptr = ERR7;
2566     goto FAILED;
2567     }
2568 nigel 77
2569 nigel 93 class_charcount -= 2; /* Undo the default count from above */
2570     c = *ptr; /* Get the final character and fall through */
2571 nigel 77 }
2572    
2573     /* Fall through if we have a single character (c >= 0). This may be
2574 nigel 93 greater than 256 in UTF-8 mode. */
2575 nigel 77
2576     } /* End of backslash handling */
2577    
2578     /* A single character may be followed by '-' to form a range. However,
2579     Perl does not permit ']' to be the end of the range. A '-' character
2580 nigel 93 at the end is treated as a literal. Perl ignores orphaned \E sequences
2581     entirely. The code for handling \Q and \E is messy. */
2582 nigel 77
2583 nigel 93 CHECK_RANGE:
2584     while (ptr[1] == '\\' && ptr[2] == 'E')
2585 nigel 77 {
2586 nigel 93 inescq = FALSE;
2587     ptr += 2;
2588     }
2589    
2590     oldptr = ptr;
2591    
2592     if (!inescq && ptr[1] == '-')
2593     {
2594 nigel 77 int d;
2595     ptr += 2;
2596 nigel 93 while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
2597 nigel 77
2598 nigel 93 /* If we hit \Q (not followed by \E) at this point, go into escaped
2599     mode. */
2600    
2601     while (*ptr == '\\' && ptr[1] == 'Q')
2602     {
2603     ptr += 2;
2604     if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
2605     inescq = TRUE;
2606     break;
2607     }
2608    
2609     if (*ptr == 0 || (!inescq && *ptr == ']'))
2610     {
2611     ptr = oldptr;
2612     goto LONE_SINGLE_CHARACTER;
2613     }
2614    
2615 nigel 77 #ifdef SUPPORT_UTF8
2616     if (utf8)
2617     { /* Braces are required because the */
2618     GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
2619     }
2620     else
2621     #endif
2622     d = *ptr; /* Not UTF-8 mode */
2623    
2624     /* The second part of a range can be a single-character escape, but
2625     not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2626     in such circumstances. */
2627    
2628 nigel 93 if (!inescq && d == '\\')
2629 nigel 77 {
2630 nigel 93 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2631     if (*errorcodeptr != 0) goto FAILED;
2632 nigel 77
2633 nigel 93 /* \b is backspace; \X is literal X; \R is literal R; any other
2634     special means the '-' was literal */
2635 nigel 77
2636     if (d < 0)
2637     {
2638     if (d == -ESC_b) d = '\b';
2639 nigel 93 else if (d == -ESC_X) d = 'X';
2640     else if (d == -ESC_R) d = 'R'; else
2641 nigel 77 {
2642 nigel 93 ptr = oldptr;
2643 nigel 77 goto LONE_SINGLE_CHARACTER; /* A few lines below */
2644     }
2645     }
2646     }
2647    
2648 nigel 93 /* Check that the two values are in the correct order. Optimize
2649     one-character ranges */
2650 nigel 77
2651 nigel 93 if (d < c)
2652     {
2653     *errorcodeptr = ERR8;
2654     goto FAILED;
2655     }
2656    
2657 nigel 77 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
2658    
2659     /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
2660     matching, we have to use an XCLASS with extra data items. Caseless
2661     matching for characters > 127 is available only if UCP support is
2662     available. */
2663    
2664     #ifdef SUPPORT_UTF8
2665     if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
2666     {
2667     class_utf8 = TRUE;
2668    
2669     /* With UCP support, we can find the other case equivalents of
2670     the relevant characters. There may be several ranges. Optimize how
2671     they fit with the basic range. */
2672    
2673     #ifdef SUPPORT_UCP
2674     if ((options & PCRE_CASELESS) != 0)
2675     {
2676 nigel 93 unsigned int occ, ocd;
2677     unsigned int cc = c;
2678     unsigned int origd = d;
2679 nigel 77 while (get_othercase_range(&cc, origd, &occ, &ocd))
2680     {
2681     if (occ >= c && ocd <= d) continue; /* Skip embedded ranges */
2682    
2683     if (occ < c && ocd >= c - 1) /* Extend the basic range */
2684     { /* if there is overlap, */
2685     c = occ; /* noting that if occ < c */
2686     continue; /* we can't have ocd > d */
2687     } /* because a subrange is */
2688     if (ocd > d && occ <= d + 1) /* always shorter than */
2689     { /* the basic range. */
2690     d = ocd;
2691     continue;
2692     }
2693    
2694     if (occ == ocd)
2695     {
2696     *class_utf8data++ = XCL_SINGLE;
2697     }
2698     else
2699     {
2700     *class_utf8data++ = XCL_RANGE;
2701     class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
2702     }
2703     class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
2704     }
2705     }
2706     #endif /* SUPPORT_UCP */
2707    
2708     /* Now record the original range, possibly modified for UCP caseless
2709     overlapping ranges. */
2710    
2711     *class_utf8data++ = XCL_RANGE;
2712     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2713     class_utf8data += _pcre_ord2utf8(d, class_utf8data);
2714    
2715     /* With UCP support, we are done. Without UCP support, there is no
2716     caseless matching for UTF-8 characters > 127; we can use the bit map
2717     for the smaller ones. */
2718    
2719     #ifdef SUPPORT_UCP
2720     continue; /* With next character in the class */
2721     #else
2722     if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
2723    
2724     /* Adjust upper limit and fall through to set up the map */
2725    
2726     d = 127;
2727    
2728     #endif /* SUPPORT_UCP */
2729     }
2730     #endif /* SUPPORT_UTF8 */
2731    
2732     /* We use the bit map for all cases when not in UTF-8 mode; else
2733     ranges that lie entirely within 0-127 when there is UCP support; else
2734     for partial ranges without UCP support. */
2735    
2736 nigel 93 class_charcount += d - c + 1;
2737     class_lastchar = d;
2738    
2739     /* We can save a bit of time by skipping this in the pre-compile. */
2740    
2741     if (lengthptr == NULL) for (; c <= d; c++)
2742 nigel 77 {
2743     classbits[c/8] |= (1 << (c&7));
2744     if ((options & PCRE_CASELESS) != 0)
2745     {
2746     int uc = cd->fcc[c]; /* flip case */
2747     classbits[uc/8] |= (1 << (uc&7));
2748     }
2749     }
2750    
2751     continue; /* Go get the next char in the class */
2752     }
2753    
2754     /* Handle a lone single character - we can get here for a normal
2755     non-escape char, or after \ that introduces a single character or for an
2756     apparent range that isn't. */
2757    
2758     LONE_SINGLE_CHARACTER:
2759    
2760     /* Handle a character that cannot go in the bit map */
2761    
2762     #ifdef SUPPORT_UTF8
2763     if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
2764     {
2765     class_utf8 = TRUE;
2766     *class_utf8data++ = XCL_SINGLE;
2767     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2768    
2769     #ifdef SUPPORT_UCP
2770     if ((options & PCRE_CASELESS) != 0)
2771     {
2772 nigel 93 unsigned int othercase;
2773     if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
2774 nigel 77 {
2775     *class_utf8data++ = XCL_SINGLE;
2776     class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
2777     }
2778     }
2779     #endif /* SUPPORT_UCP */
2780    
2781     }
2782     else
2783     #endif /* SUPPORT_UTF8 */
2784    
2785     /* Handle a single-byte character */
2786     {
2787     classbits[c/8] |= (1 << (c&7));
2788     if ((options & PCRE_CASELESS) != 0)
2789     {
2790     c = cd->fcc[c]; /* flip case */
2791     classbits[c/8] |= (1 << (c&7));
2792     }
2793     class_charcount++;
2794     class_lastchar = c;
2795     }
2796     }
2797    
2798 nigel 93 /* Loop until ']' reached. This "while" is the end of the "do" above. */
2799 nigel 77
2800 nigel 93 while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
2801 nigel 77
2802 nigel 93 if (c == 0) /* Missing terminating ']' */
2803     {
2804     *errorcodeptr = ERR6;
2805     goto FAILED;
2806     }
2807    
2808 nigel 77 /* If class_charcount is 1, we saw precisely one character whose value is
2809     less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
2810     can optimize the negative case only if there were no characters >= 128
2811     because OP_NOT and the related opcodes like OP_NOTSTAR operate on
2812     single-bytes only. This is an historical hangover. Maybe one day we can
2813     tidy these opcodes to handle multi-byte characters.
2814    
2815     The optimization throws away the bit map. We turn the item into a
2816     1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
2817     that OP_NOT does not support multibyte characters. In the positive case, it
2818     can cause firstbyte to be set. Otherwise, there can be no first char if
2819     this item is first, whatever repeat count may follow. In the case of
2820     reqbyte, save the previous value for reinstating. */
2821    
2822     #ifdef SUPPORT_UTF8
2823     if (class_charcount == 1 &&
2824     (!utf8 ||
2825     (!class_utf8 && (!negate_class || class_lastchar < 128))))
2826    
2827     #else
2828     if (class_charcount == 1)
2829     #endif
2830     {
2831     zeroreqbyte = reqbyte;
2832    
2833     /* The OP_NOT opcode works on one-byte characters only. */
2834    
2835     if (negate_class)
2836     {
2837     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2838     zerofirstbyte = firstbyte;
2839     *code++ = OP_NOT;
2840     *code++ = class_lastchar;
2841     break;
2842     }
2843    
2844     /* For a single, positive character, get the value into mcbuffer, and
2845     then we can handle this with the normal one-character code. */
2846    
2847     #ifdef SUPPORT_UTF8
2848     if (utf8 && class_lastchar > 127)
2849     mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
2850     else
2851     #endif
2852     {
2853     mcbuffer[0] = class_lastchar;
2854     mclength = 1;
2855     }
2856     goto ONE_CHAR;
2857     } /* End of 1-char optimization */
2858    
2859     /* The general case - not the one-char optimization. If this is the first
2860     thing in the branch, there can be no first char setting, whatever the
2861     repeat count. Any reqbyte setting must remain unchanged after any kind of
2862     repeat. */
2863    
2864     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2865     zerofirstbyte = firstbyte;
2866     zeroreqbyte = reqbyte;
2867    
2868     /* If there are characters with values > 255, we have to compile an
2869     extended class, with its own opcode. If there are no characters < 256,
2870 nigel 93 we can omit the bitmap in the actual compiled code. */
2871 nigel 77
2872     #ifdef SUPPORT_UTF8
2873     if (class_utf8)
2874     {
2875     *class_utf8data++ = XCL_END; /* Marks the end of extra data */
2876     *code++ = OP_XCLASS;
2877     code += LINK_SIZE;
2878     *code = negate_class? XCL_NOT : 0;
2879    
2880 nigel 93 /* If the map is required, move up the extra data to make room for it;
2881     otherwise just move the code pointer to the end of the extra data. */
2882 nigel 77
2883     if (class_charcount > 0)
2884     {
2885     *code++ |= XCL_MAP;
2886 nigel 93 memmove(code + 32, code, class_utf8data - code);
2887 nigel 77 memcpy(code, classbits, 32);
2888 nigel 93 code = class_utf8data + 32;
2889 nigel 77 }
2890 nigel 93 else code = class_utf8data;
2891 nigel 77
2892     /* Now fill in the complete length of the item */
2893    
2894     PUT(previous, 1, code - previous);
2895     break; /* End of class handling */
2896     }
2897     #endif
2898    
2899     /* If there are no characters > 255, negate the 32-byte map if necessary,
2900     and copy it into the code vector. If this is the first thing in the branch,
2901     there can be no first char setting, whatever the repeat count. Any reqbyte
2902     setting must remain unchanged after any kind of repeat. */
2903    
2904     if (negate_class)
2905     {
2906     *code++ = OP_NCLASS;
2907 nigel 93 if (lengthptr == NULL) /* Save time in the pre-compile phase */
2908     for (c = 0; c < 32; c++) code[c] = ~classbits[c];
2909 nigel 77 }
2910     else
2911     {
2912     *code++ = OP_CLASS;
2913     memcpy(code, classbits, 32);
2914     }
2915     code += 32;
2916     break;
2917    
2918 nigel 93
2919     /* ===================================================================*/
2920 nigel 77 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
2921     has been tested above. */
2922    
2923     case '{':
2924     if (!is_quantifier) goto NORMAL_CHAR;
2925     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
2926     if (*errorcodeptr != 0) goto FAILED;
2927     goto REPEAT;
2928    
2929     case '*':
2930     repeat_min = 0;
2931     repeat_max = -1;
2932     goto REPEAT;
2933    
2934     case '+':
2935     repeat_min = 1;
2936     repeat_max = -1;
2937     goto REPEAT;
2938    
2939     case '?':
2940     repeat_min = 0;
2941     repeat_max = 1;
2942    
2943     REPEAT:
2944     if (previous == NULL)
2945     {
2946     *errorcodeptr = ERR9;
2947     goto FAILED;
2948     }
2949    
2950     if (repeat_min == 0)
2951     {
2952     firstbyte = zerofirstbyte; /* Adjust for zero repeat */
2953     reqbyte = zeroreqbyte; /* Ditto */
2954     }
2955    
2956     /* Remember whether this is a variable length repeat */
2957    
2958     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
2959    
2960     op_type = 0; /* Default single-char op codes */
2961     possessive_quantifier = FALSE; /* Default not possessive quantifier */
2962    
2963     /* Save start of previous item, in case we have to move it up to make space
2964     for an inserted OP_ONCE for the additional '+' extension. */
2965    
2966     tempcode = previous;
2967    
2968     /* If the next character is '+', we have a possessive quantifier. This
2969     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
2970     If the next character is '?' this is a minimizing repeat, by default,
2971     but if PCRE_UNGREEDY is set, it works the other way round. We change the
2972     repeat type to the non-default. */
2973    
2974     if (ptr[1] == '+')
2975     {
2976     repeat_type = 0; /* Force greedy */
2977     possessive_quantifier = TRUE;
2978     ptr++;
2979     }
2980     else if (ptr[1] == '?')
2981     {
2982     repeat_type = greedy_non_default;
2983     ptr++;
2984     }
2985     else repeat_type = greedy_default;
2986    
2987     /* If previous was a character match, abolish the item and generate a
2988     repeat item instead. If a char item has a minimum of more than one, ensure
2989     that it is set in reqbyte - it might not be if a sequence such as x{3} is
2990     the first thing in a branch because the x will have gone into firstbyte
2991     instead. */
2992    
2993     if (*previous == OP_CHAR || *previous == OP_CHARNC)
2994     {
2995     /* Deal with UTF-8 characters that take up more than one byte. It's
2996     easier to write this out separately than try to macrify it. Use c to
2997     hold the length of the character in bytes, plus 0x80 to flag that it's a
2998     length rather than a small character. */
2999    
3000     #ifdef SUPPORT_UTF8
3001     if (utf8 && (code[-1] & 0x80) != 0)
3002     {
3003     uschar *lastchar = code - 1;
3004     while((*lastchar & 0xc0) == 0x80) lastchar--;
3005     c = code - lastchar; /* Length of UTF-8 character */
3006     memcpy(utf8_char, lastchar, c); /* Save the char */
3007     c |= 0x80; /* Flag c as a length */
3008     }
3009     else
3010     #endif
3011    
3012     /* Handle the case of a single byte - either with no UTF8 support, or
3013     with UTF-8 disabled, or for a UTF-8 character < 128. */
3014    
3015     {
3016     c = code[-1];
3017     if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3018     }
3019    
3020 nigel 93 /* If the repetition is unlimited, it pays to see if the next thing on
3021     the line is something that cannot possibly match this character. If so,
3022     automatically possessifying this item gains some performance in the case
3023     where the match fails. */
3024    
3025     if (!possessive_quantifier &&
3026     repeat_max < 0 &&
3027     check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3028     options, cd))
3029     {
3030     repeat_type = 0; /* Force greedy */
3031     possessive_quantifier = TRUE;
3032     }
3033    
3034 nigel 77 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3035     }
3036    
3037     /* If previous was a single negated character ([^a] or similar), we use
3038     one of the special opcodes, replacing it. The code is shared with single-
3039     character repeats by setting op_type to add a suitable offset into
3040 nigel 93 repeat_type. We can also test for auto-possessification. OP_NOT is
3041     currently used only for single-byte chars. */
3042 nigel 77
3043     else if (*previous == OP_NOT)
3044     {
3045     op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3046     c = previous[1];
3047 nigel 93 if (!possessive_quantifier &&
3048     repeat_max < 0 &&
3049     check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3050     {
3051     repeat_type = 0; /* Force greedy */
3052     possessive_quantifier = TRUE;
3053     }
3054 nigel 77 goto OUTPUT_SINGLE_REPEAT;
3055     }
3056    
3057     /* If previous was a character type match (\d or similar), abolish it and
3058     create a suitable repeat item. The code is shared with single-character
3059     repeats by setting op_type to add a suitable offset into repeat_type. Note
3060     that the Unicode property types will be present only when SUPPORT_UCP is
3061     defined, but we don't wrap the little bits of code here because it just
3062     makes it horribly messy. */
3063    
3064     else if (*previous < OP_EODN)
3065     {
3066     uschar *oldcode;
3067 nigel 87 int prop_type, prop_value;
3068 nigel 77 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3069     c = *previous;
3070    
3071 nigel 93 if (!possessive_quantifier &&
3072     repeat_max < 0 &&
3073     check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3074     {
3075     repeat_type = 0; /* Force greedy */
3076     possessive_quantifier = TRUE;
3077     }
3078    
3079 nigel 77 OUTPUT_SINGLE_REPEAT:
3080 nigel 87 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3081     {
3082     prop_type = previous[1];
3083     prop_value = previous[2];
3084     }
3085     else prop_type = prop_value = -1;
3086 nigel 77
3087     oldcode = code;
3088     code = previous; /* Usually overwrite previous item */
3089    
3090     /* If the maximum is zero then the minimum must also be zero; Perl allows
3091     this case, so we do too - by simply omitting the item altogether. */
3092    
3093     if (repeat_max == 0) goto END_REPEAT;
3094    
3095     /* All real repeats make it impossible to handle partial matching (maybe
3096     one day we will be able to remove this restriction). */
3097    
3098     if (repeat_max != 1) cd->nopartial = TRUE;
3099    
3100     /* Combine the op_type with the repeat_type */
3101    
3102     repeat_type += op_type;
3103    
3104     /* A minimum of zero is handled either as the special case * or ?, or as
3105     an UPTO, with the maximum given. */
3106    
3107     if (repeat_min == 0)
3108     {
3109     if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3110     else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3111     else
3112     {
3113     *code++ = OP_UPTO + repeat_type;
3114     PUT2INC(code, 0, repeat_max);
3115     }
3116     }
3117    
3118     /* A repeat minimum of 1 is optimized into some special cases. If the
3119 nigel 93 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3120 nigel 77 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3121     one less than the maximum. */
3122    
3123     else if (repeat_min == 1)
3124     {
3125     if (repeat_max == -1)
3126     *code++ = OP_PLUS + repeat_type;
3127     else
3128     {
3129     code = oldcode; /* leave previous item in place */
3130     if (repeat_max == 1) goto END_REPEAT;
3131     *code++ = OP_UPTO + repeat_type;
3132     PUT2INC(code, 0, repeat_max - 1);
3133     }
3134     }
3135    
3136     /* The case {n,n} is just an EXACT, while the general case {n,m} is
3137     handled as an EXACT followed by an UPTO. */
3138    
3139     else
3140     {
3141     *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3142     PUT2INC(code, 0, repeat_min);
3143    
3144     /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3145     we have to insert the character for the previous code. For a repeated
3146 nigel 87 Unicode property match, there are two extra bytes that define the
3147 nigel 77 required property. In UTF-8 mode, long characters have their length in
3148     c, with the 0x80 bit as a flag. */
3149    
3150     if (repeat_max < 0)
3151     {
3152     #ifdef SUPPORT_UTF8
3153     if (utf8 && c >= 128)
3154     {
3155     memcpy(code, utf8_char, c & 7);
3156     code += c & 7;
3157     }
3158     else
3159     #endif
3160     {
3161     *code++ = c;
3162 nigel 87 if (prop_type >= 0)
3163     {
3164     *code++ = prop_type;
3165     *code++ = prop_value;
3166     }
3167 nigel 77 }
3168     *code++ = OP_STAR + repeat_type;
3169     }
3170    
3171     /* Else insert an UPTO if the max is greater than the min, again
3172 nigel 93 preceded by the character, for the previously inserted code. If the
3173     UPTO is just for 1 instance, we can use QUERY instead. */
3174 nigel 77
3175     else if (repeat_max != repeat_min)
3176     {
3177     #ifdef SUPPORT_UTF8
3178     if (utf8 && c >= 128)
3179     {
3180     memcpy(code, utf8_char, c & 7);
3181     code += c & 7;
3182     }
3183     else
3184     #endif
3185     *code++ = c;
3186 nigel 87 if (prop_type >= 0)
3187     {
3188     *code++ = prop_type;
3189     *code++ = prop_value;
3190     }
3191 nigel 77 repeat_max -= repeat_min;
3192 nigel 93
3193     if (repeat_max == 1)
3194     {
3195     *code++ = OP_QUERY + repeat_type;
3196     }
3197     else
3198     {
3199     *code++ = OP_UPTO + repeat_type;
3200     PUT2INC(code, 0, repeat_max);
3201     }
3202 nigel 77 }
3203     }
3204    
3205     /* The character or character type itself comes last in all cases. */
3206    
3207     #ifdef SUPPORT_UTF8
3208     if (utf8 && c >= 128)
3209     {
3210     memcpy(code, utf8_char, c & 7);
3211     code += c & 7;
3212     }
3213     else
3214     #endif
3215     *code++ = c;
3216    
3217 nigel 87 /* For a repeated Unicode property match, there are two extra bytes that
3218     define the required property. */
3219 nigel 77
3220     #ifdef SUPPORT_UCP
3221 nigel 87 if (prop_type >= 0)
3222     {
3223     *code++ = prop_type;
3224     *code++ = prop_value;
3225     }
3226 nigel 77 #endif
3227     }
3228    
3229     /* If previous was a character class or a back reference, we put the repeat
3230     stuff after it, but just skip the item if the repeat was {0,0}. */
3231    
3232     else if (*previous == OP_CLASS ||
3233     *previous == OP_NCLASS ||
3234     #ifdef SUPPORT_UTF8
3235     *previous == OP_XCLASS ||
3236     #endif
3237     *previous == OP_REF)
3238     {
3239     if (repeat_max == 0)
3240     {
3241     code = previous;
3242     goto END_REPEAT;
3243     }
3244    
3245     /* All real repeats make it impossible to handle partial matching (maybe
3246     one day we will be able to remove this restriction). */
3247    
3248     if (repeat_max != 1) cd->nopartial = TRUE;
3249    
3250     if (repeat_min == 0 && repeat_max == -1)
3251     *code++ = OP_CRSTAR + repeat_type;
3252     else if (repeat_min == 1 && repeat_max == -1)
3253     *code++ = OP_CRPLUS + repeat_type;
3254     else if (repeat_min == 0 && repeat_max == 1)
3255     *code++ = OP_CRQUERY + repeat_type;
3256     else
3257     {
3258     *code++ = OP_CRRANGE + repeat_type;
3259     PUT2INC(code, 0, repeat_min);
3260     if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
3261     PUT2INC(code, 0, repeat_max);
3262     }
3263     }
3264    
3265     /* If previous was a bracket group, we may have to replicate it in certain
3266     cases. */
3267    
3268 nigel 93 else if (*previous == OP_BRA || *previous == OP_CBRA ||
3269     *previous == OP_ONCE || *previous == OP_COND)
3270 nigel 77 {
3271     register int i;
3272     int ketoffset = 0;
3273     int len = code - previous;
3274     uschar *bralink = NULL;
3275    
3276 nigel 93 /* Repeating a DEFINE group is pointless */
3277    
3278     if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3279     {
3280     *errorcodeptr = ERR55;
3281     goto FAILED;
3282     }
3283    
3284     /* This is a paranoid check to stop integer overflow later on */
3285    
3286     if (len > MAX_DUPLENGTH)
3287     {
3288     *errorcodeptr = ERR50;
3289     goto FAILED;
3290     }
3291    
3292 nigel 77 /* If the maximum repeat count is unlimited, find the end of the bracket
3293     by scanning through from the start, and compute the offset back to it
3294     from the current code pointer. There may be an OP_OPT setting following
3295     the final KET, so we can't find the end just by going back from the code
3296     pointer. */
3297    
3298     if (repeat_max == -1)
3299     {
3300     register uschar *ket = previous;
3301     do ket += GET(ket, 1); while (*ket != OP_KET);
3302     ketoffset = code - ket;
3303     }
3304    
3305     /* The case of a zero minimum is special because of the need to stick
3306     OP_BRAZERO in front of it, and because the group appears once in the
3307     data, whereas in other cases it appears the minimum number of times. For
3308     this reason, it is simplest to treat this case separately, as otherwise
3309     the code gets far too messy. There are several special subcases when the
3310     minimum is zero. */
3311    
3312     if (repeat_min == 0)
3313     {
3314     /* If the maximum is also zero, we just omit the group from the output
3315     altogether. */
3316    
3317     if (repeat_max == 0)
3318     {
3319     code = previous;
3320     goto END_REPEAT;
3321     }
3322    
3323     /* If the maximum is 1 or unlimited, we just have to stick in the
3324     BRAZERO and do no more at this point. However, we do need to adjust
3325     any OP_RECURSE calls inside the group that refer to the group itself or
3326 nigel 93 any internal or forward referenced group, because the offset is from
3327     the start of the whole regex. Temporarily terminate the pattern while
3328     doing this. */
3329 nigel 77
3330     if (repeat_max <= 1)
3331     {
3332     *code = OP_END;
3333 nigel 93 adjust_recurse(previous, 1, utf8, cd, save_hwm);
3334 nigel 77 memmove(previous+1, previous, len);
3335     code++;
3336     *previous++ = OP_BRAZERO + repeat_type;
3337     }
3338    
3339     /* If the maximum is greater than 1 and limited, we have to replicate
3340     in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3341     The first one has to be handled carefully because it's the original
3342     copy, which has to be moved up. The remainder can be handled by code
3343     that is common with the non-zero minimum case below. We have to
3344     adjust the value of repeat_max, since one less copy is required. Once
3345     again, we may have to adjust any OP_RECURSE calls inside the group. */
3346    
3347     else
3348     {
3349     int offset;
3350     *code = OP_END;
3351 nigel 93 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3352 nigel 77 memmove(previous + 2 + LINK_SIZE, previous, len);
3353     code += 2 + LINK_SIZE;
3354     *previous++ = OP_BRAZERO + repeat_type;
3355     *previous++ = OP_BRA;
3356    
3357     /* We chain together the bracket offset fields that have to be
3358     filled in later when the ends of the brackets are reached. */
3359    
3360     offset = (bralink == NULL)? 0 : previous - bralink;
3361     bralink = previous;
3362     PUTINC(previous, 0, offset);
3363     }
3364    
3365     repeat_max--;
3366     }
3367    
3368     /* If the minimum is greater than zero, replicate the group as many
3369     times as necessary, and adjust the maximum to the number of subsequent
3370     copies that we need. If we set a first char from the group, and didn't
3371 nigel 93 set a required char, copy the latter from the former. If there are any
3372     forward reference subroutine calls in the group, there will be entries on
3373     the workspace list; replicate these with an appropriate increment. */
3374 nigel 77
3375     else
3376     {
3377     if (repeat_min > 1)
3378     {
3379 nigel 93 /* In the pre-compile phase, we don't actually do the replication. We
3380     just adjust the length as if we had. */
3381    
3382     if (lengthptr != NULL)
3383     *lengthptr += (repeat_min - 1)*length_prevgroup;
3384    
3385     /* This is compiling for real */
3386    
3387     else
3388 nigel 77 {
3389 nigel 93 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3390     for (i = 1; i < repeat_min; i++)
3391     {
3392     uschar *hc;
3393     uschar *this_hwm = cd->hwm;
3394     memcpy(code, previous, len);
3395     for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3396     {
3397     PUT(cd->hwm, 0, GET(hc, 0) + len);
3398     cd->hwm += LINK_SIZE;
3399     }
3400     save_hwm = this_hwm;
3401     code += len;
3402     }
3403 nigel 77 }
3404     }
3405 nigel 93
3406 nigel 77 if (repeat_max > 0) repeat_max -= repeat_min;
3407     }
3408    
3409     /* This code is common to both the zero and non-zero minimum cases. If
3410     the maximum is limited, it replicates the group in a nested fashion,
3411     remembering the bracket starts on a stack. In the case of a zero minimum,
3412     the first one was set up above. In all cases the repeat_max now specifies
3413 nigel 93 the number of additional copies needed. Again, we must remember to
3414     replicate entries on the forward reference list. */
3415 nigel 77
3416     if (repeat_max >= 0)
3417     {
3418 nigel 93 /* In the pre-compile phase, we don't actually do the replication. We
3419     just adjust the length as if we had. For each repetition we must add 1
3420     to the length for BRAZERO and for all but the last repetition we must
3421     add 2 + 2*LINK_SIZE to allow for the nesting that occurs. */
3422    
3423     if (lengthptr != NULL && repeat_max > 0)
3424     *lengthptr += repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3425     2 - 2*LINK_SIZE; /* Last one doesn't nest */
3426    
3427     /* This is compiling for real */
3428    
3429     else for (i = repeat_max - 1; i >= 0; i--)
3430 nigel 77 {
3431 nigel 93 uschar *hc;
3432     uschar *this_hwm = cd->hwm;
3433    
3434 nigel 77 *code++ = OP_BRAZERO + repeat_type;
3435    
3436     /* All but the final copy start a new nesting, maintaining the
3437     chain of brackets outstanding. */
3438    
3439     if (i != 0)
3440     {
3441     int offset;
3442     *code++ = OP_BRA;
3443     offset = (bralink == NULL)? 0 : code - bralink;
3444     bralink = code;
3445     PUTINC(code, 0, offset);
3446     }
3447    
3448     memcpy(code, previous, len);
3449 nigel 93 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3450     {
3451     PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3452     cd->hwm += LINK_SIZE;
3453     }
3454     save_hwm = this_hwm;
3455 nigel 77 code += len;
3456     }
3457    
3458     /* Now chain through the pending brackets, and fill in their length
3459     fields (which are holding the chain links pro tem). */
3460    
3461     while (bralink != NULL)
3462     {
3463     int oldlinkoffset;
3464     int offset = code - bralink + 1;
3465     uschar *bra = code - offset;
3466     oldlinkoffset = GET(bra, 1);
3467     bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
3468     *code++ = OP_KET;
3469     PUTINC(code, 0, offset);
3470     PUT(bra, 1, offset);
3471     }
3472     }
3473    
3474     /* If the maximum is unlimited, set a repeater in the final copy. We
3475     can't just offset backwards from the current code point, because we
3476     don't know if there's been an options resetting after the ket. The
3477 nigel 93 correct offset was computed above.
3478 nigel 77
3479 nigel 93 Then, when we are doing the actual compile phase, check to see whether
3480     this group is a non-atomic one that could match an empty string. If so,
3481     convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
3482     that runtime checking can be done. [This check is also applied to
3483     atomic groups at runtime, but in a different way.] */
3484    
3485     else
3486     {
3487     uschar *ketcode = code - ketoffset;
3488     uschar *bracode = ketcode - GET(ketcode, 1);
3489     *ketcode = OP_KETRMAX + repeat_type;
3490     if (lengthptr == NULL && *bracode != OP_ONCE)
3491     {
3492     uschar *scode = bracode;
3493     do
3494     {
3495     if (could_be_empty_branch(scode, ketcode, utf8))
3496     {
3497     *bracode += OP_SBRA - OP_BRA;
3498     break;
3499     }
3500     scode += GET(scode, 1);
3501     }
3502     while (*scode == OP_ALT);
3503     }
3504     }
3505 nigel 77 }
3506    
3507     /* Else there's some kind of shambles */
3508    
3509     else
3510     {
3511     *errorcodeptr = ERR11;
3512     goto FAILED;
3513     }
3514    
3515 nigel 93 /* If the character following a repeat is '+', or if certain optimization
3516     tests above succeeded, possessive_quantifier is TRUE. For some of the
3517     simpler opcodes, there is a special alternative opcode for this. For
3518     anything else, we wrap the entire repeated item inside OP_ONCE brackets.
3519     The '+' notation is just syntactic sugar, taken from Sun's Java package,
3520     but the special opcodes can optimize it a bit. The repeated item starts at
3521     tempcode, not at previous, which might be the first part of a string whose
3522     (former) last char we repeated.
3523 nigel 77
3524 nigel 93 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
3525     an 'upto' may follow. We skip over an 'exact' item, and then test the
3526     length of what remains before proceeding. */
3527    
3528 nigel 77 if (possessive_quantifier)
3529     {
3530 nigel 93 int len;
3531     if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
3532     *tempcode == OP_NOTEXACT)
3533     tempcode += _pcre_OP_lengths[*tempcode];
3534     len = code - tempcode;
3535     if (len > 0) switch (*tempcode)
3536     {
3537     case OP_STAR: *tempcode = OP_POSSTAR; break;
3538     case OP_PLUS: *tempcode = OP_POSPLUS; break;
3539     case OP_QUERY: *tempcode = OP_POSQUERY; break;
3540     case OP_UPTO: *tempcode = OP_POSUPTO; break;
3541    
3542     case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
3543     case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
3544     case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
3545     case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
3546    
3547     case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
3548     case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
3549     case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
3550     case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
3551    
3552     default:
3553     memmove(tempcode + 1+LINK_SIZE, tempcode, len);
3554     code += 1 + LINK_SIZE;
3555     len += 1 + LINK_SIZE;
3556     tempcode[0] = OP_ONCE;
3557     *code++ = OP_KET;
3558     PUTINC(code, 0, len);
3559     PUT(tempcode, 1, len);
3560     break;
3561     }
3562 nigel 77 }
3563    
3564     /* In all cases we no longer have a previous item. We also set the
3565     "follows varying string" flag for subsequently encountered reqbytes if
3566     it isn't already set and we have just passed a varying length item. */
3567    
3568     END_REPEAT:
3569     previous = NULL;
3570     cd->req_varyopt |= reqvary;
3571     break;
3572    
3573    
3574 nigel 93 /* ===================================================================*/
3575     /* Start of nested parenthesized sub-expression, or comment or lookahead or
3576     lookbehind or option setting or condition or all the other extended
3577     parenthesis forms. First deal with the specials; all are introduced by ?,
3578     and the appearance of any of them means that this is not a capturing
3579     group. */
3580 nigel 77
3581     case '(':
3582     newoptions = options;
3583     skipbytes = 0;
3584 nigel 93 bravalue = OP_CBRA;
3585     save_hwm = cd->hwm;
3586 nigel 77
3587     if (*(++ptr) == '?')
3588     {
3589 nigel 93 int i, set, unset, namelen;
3590 nigel 77 int *optset;
3591 nigel 93 const uschar *name;
3592     uschar *slot;
3593 nigel 77
3594     switch (*(++ptr))
3595     {
3596     case '#': /* Comment; skip to ket */
3597     ptr++;
3598 nigel 93 while (*ptr != 0 && *ptr != ')') ptr++;
3599     if (*ptr == 0)
3600     {
3601     *errorcodeptr = ERR18;
3602     goto FAILED;
3603     }
3604 nigel 77 continue;
3605    
3606 nigel 93
3607     /* ------------------------------------------------------------ */
3608     case ':': /* Non-capturing bracket */
3609 nigel 77 bravalue = OP_BRA;
3610     ptr++;
3611     break;
3612    
3613 nigel 93
3614     /* ------------------------------------------------------------ */
3615 nigel 77 case '(':
3616     bravalue = OP_COND; /* Conditional group */
3617    
3618 nigel 93 /* A condition can be an assertion, a number (referring to a numbered
3619     group), a name (referring to a named group), or 'R', referring to
3620     recursion. R<digits> and R&name are also permitted for recursion tests.
3621 nigel 77
3622 nigel 93 There are several syntaxes for testing a named group: (?(name)) is used
3623     by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
3624    
3625     There are two unfortunate ambiguities, caused by history. (a) 'R' can
3626     be the recursive thing or the name 'R' (and similarly for 'R' followed
3627     by digits), and (b) a number could be a name that consists of digits.
3628     In both cases, we look for a name first; if not found, we try the other
3629     cases. */
3630    
3631     /* For conditions that are assertions, check the syntax, and then exit
3632     the switch. This will take control down to where bracketed groups,
3633     including assertions, are processed. */
3634    
3635     if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
3636     break;
3637    
3638     /* Most other conditions use OP_CREF (a couple change to OP_RREF
3639     below), and all need to skip 3 bytes at the start of the group. */
3640    
3641     code[1+LINK_SIZE] = OP_CREF;
3642     skipbytes = 3;
3643 ph10 167 refsign = -1;
3644 nigel 93
3645     /* Check for a test for recursion in a named group. */
3646    
3647     if (ptr[1] == 'R' && ptr[2] == '&')
3648 nigel 77 {
3649 nigel 93 terminator = -1;
3650     ptr += 2;
3651     code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
3652     }
3653 nigel 91
3654 nigel 93 /* Check for a test for a named group's having been set, using the Perl
3655     syntax (?(<name>) or (?('name') */
3656 nigel 91
3657 nigel 93 else if (ptr[1] == '<')
3658     {
3659     terminator = '>';
3660     ptr++;
3661     }
3662     else if (ptr[1] == '\'')
3663     {
3664     terminator = '\'';
3665     ptr++;
3666     }
3667 ph10 167 else
3668     {
3669     terminator = 0;
3670     if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
3671     }
3672 nigel 77
3673 nigel 93 /* We now expect to read a name; anything else is an error */
3674 nigel 77
3675 nigel 93 if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
3676     {
3677     ptr += 1; /* To get the right offset */
3678     *errorcodeptr = ERR28;
3679     goto FAILED;
3680     }
3681    
3682     /* Read the name, but also get it as a number if it's all digits */
3683    
3684     recno = 0;
3685     name = ++ptr;
3686     while ((cd->ctypes[*ptr] & ctype_word) != 0)
3687     {
3688     if (recno >= 0)
3689     recno = ((digitab[*ptr] & ctype_digit) != 0)?
3690     recno * 10 + *ptr - '0' : -1;
3691 nigel 91 ptr++;
3692 nigel 93 }
3693     namelen = ptr - name;
3694 nigel 91
3695 nigel 93 if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
3696     {
3697     ptr--; /* Error offset */
3698     *errorcodeptr = ERR26;
3699     goto FAILED;
3700     }
3701 nigel 91
3702 nigel 93 /* Do no further checking in the pre-compile phase. */
3703 nigel 91
3704 nigel 93 if (lengthptr != NULL) break;
3705 nigel 91
3706 nigel 93 /* In the real compile we do the work of looking for the actual
3707 ph10 167 reference. If the string started with "+" or "-" we require the rest to
3708     be digits, in which case recno will be set. */
3709    
3710     if (refsign > 0)
3711     {
3712     if (recno <= 0)
3713     {
3714     *errorcodeptr = ERR58;
3715     goto FAILED;
3716     }
3717     if (refsign == '-')
3718     {
3719     recno = cd->bracount - recno + 1;
3720     if (recno <= 0)
3721     {
3722     *errorcodeptr = ERR15;
3723     goto FAILED;
3724     }
3725     }
3726     else recno += cd->bracount;
3727     PUT2(code, 2+LINK_SIZE, recno);
3728     break;
3729     }
3730 nigel 91
3731 ph10 167 /* Otherwise (did not start with "+" or "-"), start by looking for the
3732     name. */
3733    
3734 nigel 93 slot = cd->name_table;
3735     for (i = 0; i < cd->names_found; i++)
3736     {
3737     if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
3738     slot += cd->name_entry_size;
3739     }
3740 nigel 91
3741 nigel 93 /* Found a previous named subpattern */
3742 nigel 91
3743 nigel 93 if (i < cd->names_found)
3744     {
3745     recno = GET2(slot, 0);
3746     PUT2(code, 2+LINK_SIZE, recno);
3747     }
3748 nigel 91
3749 nigel 93 /* Search the pattern for a forward reference */
3750 nigel 91
3751 nigel 93 else if ((i = find_parens(ptr, cd->bracount, name, namelen,
3752     (options & PCRE_EXTENDED) != 0)) > 0)
3753     {
3754     PUT2(code, 2+LINK_SIZE, i);
3755     }
3756 nigel 91
3757 nigel 93 /* If terminator == 0 it means that the name followed directly after
3758     the opening parenthesis [e.g. (?(abc)...] and in this case there are
3759     some further alternatives to try. For the cases where terminator != 0
3760     [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
3761     now checked all the possibilities, so give an error. */
3762 nigel 91
3763 nigel 93 else if (terminator != 0)
3764     {
3765     *errorcodeptr = ERR15;
3766     goto FAILED;
3767     }
3768    
3769     /* Check for (?(R) for recursion. Allow digits after R to specify a
3770     specific group number. */
3771    
3772     else if (*name == 'R')
3773     {
3774     recno = 0;
3775     for (i = 1; i < namelen; i++)
3776 nigel 91 {
3777 nigel 93 if ((digitab[name[i]] & ctype_digit) == 0)
3778     {
3779     *errorcodeptr = ERR15;
3780     goto FAILED;
3781     }
3782     recno = recno * 10 + name[i] - '0';
3783 nigel 77 }
3784 nigel 93 if (recno == 0) recno = RREF_ANY;
3785     code[1+LINK_SIZE] = OP_RREF; /* Change test type */
3786     PUT2(code, 2+LINK_SIZE, recno);
3787 nigel 77 }
3788 nigel 91
3789 nigel 93 /* Similarly, check for the (?(DEFINE) "condition", which is always
3790     false. */
3791 nigel 91
3792 nigel 93 else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
3793     {
3794     code[1+LINK_SIZE] = OP_DEF;
3795     skipbytes = 1;
3796     }
3797    
3798     /* Check for the "name" actually being a subpattern number. */
3799    
3800     else if (recno > 0)
3801     {
3802     PUT2(code, 2+LINK_SIZE, recno);
3803     }
3804    
3805     /* Either an unidentified subpattern, or a reference to (?(0) */
3806    
3807     else
3808     {
3809     *errorcodeptr = (recno == 0)? ERR35: ERR15;
3810     goto FAILED;
3811     }
3812 nigel 77 break;
3813    
3814 nigel 93
3815     /* ------------------------------------------------------------ */
3816 nigel 77 case '=': /* Positive lookahead */
3817     bravalue = OP_ASSERT;
3818     ptr++;
3819     break;
3820    
3821 nigel 93
3822     /* ------------------------------------------------------------ */
3823 nigel 77 case '!': /* Negative lookahead */
3824     bravalue = OP_ASSERT_NOT;
3825     ptr++;
3826     break;
3827    
3828 nigel 93
3829     /* ------------------------------------------------------------ */
3830     case '<': /* Lookbehind or named define */
3831     switch (ptr[1])
3832 nigel 77 {
3833     case '=': /* Positive lookbehind */
3834     bravalue = OP_ASSERTBACK;
3835 nigel 93 ptr += 2;
3836 nigel 77 break;
3837    
3838     case '!': /* Negative lookbehind */
3839     bravalue = OP_ASSERTBACK_NOT;
3840 nigel 93 ptr += 2;
3841 nigel 77 break;
3842 nigel 93
3843     default: /* Could be name define, else bad */
3844     if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
3845     ptr++; /* Correct offset for error */
3846     *errorcodeptr = ERR24;
3847     goto FAILED;
3848 nigel 77 }
3849     break;
3850    
3851 nigel 93
3852     /* ------------------------------------------------------------ */
3853 nigel 77 case '>': /* One-time brackets */
3854     bravalue = OP_ONCE;
3855     ptr++;
3856     break;
3857    
3858 nigel 93
3859     /* ------------------------------------------------------------ */
3860 nigel 77 case 'C': /* Callout - may be followed by digits; */
3861     previous_callout = code; /* Save for later completion */
3862     after_manual_callout = 1; /* Skip one item before completing */
3863 nigel 93 *code++ = OP_CALLOUT;
3864     {
3865 nigel 77 int n = 0;
3866     while ((digitab[*(++ptr)] & ctype_digit) != 0)
3867     n = n * 10 + *ptr - '0';
3868 nigel 93 if (*ptr != ')')
3869     {
3870     *errorcodeptr = ERR39;
3871     goto FAILED;
3872     }
3873 nigel 77 if (n > 255)
3874     {
3875     *errorcodeptr = ERR38;
3876     goto FAILED;
3877     }
3878     *code++ = n;
3879     PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
3880     PUT(code, LINK_SIZE, 0); /* Default length */
3881     code += 2 * LINK_SIZE;
3882     }
3883     previous = NULL;
3884     continue;
3885    
3886 nigel 93
3887     /* ------------------------------------------------------------ */
3888     case 'P': /* Python-style named subpattern handling */
3889     if (*(++ptr) == '=' || *ptr == '>') /* Reference or recursion */
3890 nigel 77 {
3891 nigel 93 is_recurse = *ptr == '>';
3892     terminator = ')';
3893     goto NAMED_REF_OR_RECURSE;
3894     }
3895     else if (*ptr != '<') /* Test for Python-style definition */
3896     {
3897     *errorcodeptr = ERR41;
3898     goto FAILED;
3899     }
3900     /* Fall through to handle (?P< as (?< is handled */
3901 nigel 77
3902    
3903 nigel 93 /* ------------------------------------------------------------ */
3904     DEFINE_NAME: /* Come here from (?< handling */
3905     case '\'':
3906     {
3907     terminator = (*ptr == '<')? '>' : '\'';
3908     name = ++ptr;
3909    
3910     while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
3911     namelen = ptr - name;
3912    
3913     /* In the pre-compile phase, just do a syntax check. */
3914    
3915     if (lengthptr != NULL)
3916 nigel 77 {
3917 nigel 93 if (*ptr != terminator)
3918 nigel 77 {
3919 nigel 93 *errorcodeptr = ERR42;
3920     goto FAILED;
3921     }
3922     if (cd->names_found >= MAX_NAME_COUNT)
3923     {
3924     *errorcodeptr = ERR49;
3925     goto FAILED;
3926     }
3927     if (namelen + 3 > cd->name_entry_size)
3928     {
3929     cd->name_entry_size = namelen + 3;
3930     if (namelen > MAX_NAME_SIZE)
3931 nigel 77 {
3932 nigel 93 *errorcodeptr = ERR48;
3933     goto FAILED;
3934     }
3935     }
3936     }
3937    
3938     /* In the real compile, create the entry in the table */
3939    
3940     else
3941     {
3942     slot = cd->name_table;
3943     for (i = 0; i < cd->names_found; i++)
3944     {
3945     int crc = memcmp(name, slot+2, namelen);
3946     if (crc == 0)
3947     {
3948     if (slot[2+namelen] == 0)
3949 nigel 91 {
3950 nigel 93 if ((options & PCRE_DUPNAMES) == 0)
3951     {
3952     *errorcodeptr = ERR43;
3953     goto FAILED;
3954     }
3955 nigel 91 }
3956 nigel 93 else crc = -1; /* Current name is substring */
3957 nigel 77 }
3958 nigel 93 if (crc < 0)
3959     {
3960     memmove(slot + cd->name_entry_size, slot,
3961     (cd->names_found - i) * cd->name_entry_size);
3962     break;
3963     }
3964     slot += cd->name_entry_size;
3965 nigel 77 }
3966 nigel 93
3967     PUT2(slot, 0, cd->bracount + 1);
3968     memcpy(slot + 2, name, namelen);
3969     slot[2+namelen] = 0;
3970 nigel 77 }
3971     }
3972    
3973 nigel 93 /* In both cases, count the number of names we've encountered. */
3974    
3975     ptr++; /* Move past > or ' */
3976     cd->names_found++;
3977     goto NUMBERED_GROUP;
3978    
3979    
3980     /* ------------------------------------------------------------ */
3981     case '&': /* Perl recursion/subroutine syntax */
3982     terminator = ')';
3983     is_recurse = TRUE;
3984     /* Fall through */
3985    
3986     /* We come here from the Python syntax above that handles both
3987     references (?P=name) and recursion (?P>name), as well as falling
3988     through from the Perl recursion syntax (?&name). */
3989    
3990     NAMED_REF_OR_RECURSE:
3991     name = ++ptr;
3992     while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
3993     namelen = ptr - name;
3994    
3995     /* In the pre-compile phase, do a syntax check and set a dummy
3996     reference number. */
3997    
3998     if (lengthptr != NULL)
3999 nigel 77 {
4000 nigel 93 if (*ptr != terminator)
4001     {
4002     *errorcodeptr = ERR42;
4003     goto FAILED;
4004     }
4005     if (namelen > MAX_NAME_SIZE)
4006     {
4007     *errorcodeptr = ERR48;
4008     goto FAILED;
4009     }
4010     recno = 0;
4011     }
4012 nigel 77
4013 nigel 93 /* In the real compile, seek the name in the table */
4014 nigel 77
4015 nigel 93 else
4016     {
4017     slot = cd->name_table;
4018 nigel 77 for (i = 0; i < cd->names_found; i++)
4019     {
4020     if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4021     slot += cd->name_entry_size;
4022     }
4023 nigel 91
4024     if (i < cd->names_found) /* Back reference */
4025 nigel 77 {
4026 nigel 91 recno = GET2(slot, 0);
4027     }
4028     else if ((recno = /* Forward back reference */
4029 nigel 93 find_parens(ptr, cd->bracount, name, namelen,
4030     (options & PCRE_EXTENDED) != 0)) <= 0)
4031 nigel 91 {
4032 nigel 77 *errorcodeptr = ERR15;
4033     goto FAILED;
4034     }
4035 nigel 93 }
4036 nigel 77
4037 nigel 93 /* In both phases, we can now go to the code that handles numerical
4038     recursion or backreferences. */
4039 nigel 77
4040 nigel 93 if (is_recurse) goto HANDLE_RECURSION;
4041     else goto HANDLE_REFERENCE;
4042 nigel 77
4043    
4044 nigel 93 /* ------------------------------------------------------------ */
4045     case 'R': /* Recursion */
4046 nigel 77 ptr++; /* Same as (?0) */
4047     /* Fall through */
4048    
4049    
4050 nigel 93 /* ------------------------------------------------------------ */
4051 ph10 166 case '-': case '+':
4052 nigel 93 case '0': case '1': case '2': case '3': case '4': /* Recursion or */
4053     case '5': case '6': case '7': case '8': case '9': /* subroutine */
4054 nigel 77 {
4055     const uschar *called;
4056 ph10 166
4057 ph10 167 if ((refsign = *ptr) == '+') ptr++;
4058     else if (refsign == '-')
4059 ph10 166 {
4060     if ((digitab[ptr[1]] & ctype_digit) == 0)
4061     goto OTHER_CHAR_AFTER_QUERY;
4062     ptr++;
4063     }
4064    
4065 nigel 77 recno = 0;
4066     while((digitab[*ptr] & ctype_digit) != 0)
4067     recno = recno * 10 + *ptr++ - '0';
4068 ph10 166
4069 nigel 93 if (*ptr != ')')
4070     {
4071     *errorcodeptr = ERR29;
4072     goto FAILED;
4073     }
4074 ph10 166
4075 ph10 167 if (refsign == '-')
4076 ph10 166 {
4077     if (recno == 0)
4078     {
4079     *errorcodeptr = ERR58;
4080     goto FAILED;
4081     }
4082     recno = cd->bracount - recno + 1;
4083     if (recno <= 0)
4084     {
4085     *errorcodeptr = ERR15;
4086     goto FAILED;
4087     }
4088     }
4089 ph10 167 else if (refsign == '+')
4090 ph10 166 {
4091     if (recno == 0)
4092     {
4093     *errorcodeptr = ERR58;
4094     goto FAILED;
4095     }
4096     recno += cd->bracount;
4097     }
4098 nigel 77
4099     /* Come here from code above that handles a named recursion */
4100    
4101     HANDLE_RECURSION:
4102    
4103     previous = code;
4104 nigel 93 called = cd->start_code;
4105 nigel 77
4106 nigel 93 /* When we are actually compiling, find the bracket that is being
4107     referenced. Temporarily end the regex in case it doesn't exist before
4108     this point. If we end up with a forward reference, first check that
4109     the bracket does occur later so we can give the error (and position)
4110     now. Then remember this forward reference in the workspace so it can
4111     be filled in at the end. */
4112 nigel 77
4113 nigel 93 if (lengthptr == NULL)
4114 nigel 77 {
4115 nigel 93 *code = OP_END;
4116     if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
4117 nigel 77
4118 nigel 93 /* Forward reference */
4119 nigel 77
4120 nigel 93 if (called == NULL)
4121     {
4122     if (find_parens(ptr, cd->bracount, NULL, recno,
4123     (options & PCRE_EXTENDED) != 0) < 0)
4124     {
4125     *errorcodeptr = ERR15;
4126     goto FAILED;
4127     }
4128     called = cd->start_code + recno;
4129     PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
4130     }
4131    
4132     /* If not a forward reference, and the subpattern is still open,
4133     this is a recursive call. We check to see if this is a left
4134     recursion that could loop for ever, and diagnose that case. */
4135    
4136     else if (GET(called, 1) == 0 &&
4137     could_be_empty(called, code, bcptr, utf8))
4138     {
4139     *errorcodeptr = ERR40;
4140     goto FAILED;
4141     }
4142 nigel 77 }
4143    
4144 nigel 87 /* Insert the recursion/subroutine item, automatically wrapped inside
4145 nigel 93 "once" brackets. Set up a "previous group" length so that a
4146     subsequent quantifier will work. */
4147 nigel 77
4148 nigel 87 *code = OP_ONCE;
4149     PUT(code, 1, 2 + 2*LINK_SIZE);
4150     code += 1 + LINK_SIZE;
4151    
4152 nigel 77 *code = OP_RECURSE;
4153     PUT(code, 1, called - cd->start_code);
4154     code += 1 + LINK_SIZE;
4155 nigel 87
4156     *code = OP_KET;
4157     PUT(code, 1, 2 + 2*LINK_SIZE);
4158     code += 1 + LINK_SIZE;
4159 nigel 93
4160     length_prevgroup = 3 + 3*LINK_SIZE;
4161 nigel 77 }
4162 nigel 93
4163